Dorgtr.java

package org.mklab.sdpj.gpack.lapackwrap;

import org.mklab.nfc.matrix.ComplexNumericalMatrix;
import org.mklab.nfc.matrix.RealNumericalMatrix;
import org.mklab.nfc.scalar.ComplexNumericalScalar;
import org.mklab.nfc.scalar.RealNumericalScalar;
import org.mklab.sdpj.gpack.blaswrap.BLAS;


/**
 * ?SYTRDで求めた3重対角形を使って直行行列の一つを一般行列に乗算する。
 * 
 * @author takafumi 
   * @param <RS> type of real scalar
   * @param <RM> type of real matrix
   * @param <CS> type of complex scalar
   * @param <CM> type of complex Matrix
 */
public class Dorgtr<RS extends RealNumericalScalar<RS, RM, CS, CM>, RM extends RealNumericalMatrix<RS, RM, CS, CM>, CS extends ComplexNumericalScalar<RS, RM, CS, CM>, CM extends ComplexNumericalMatrix<RS, RM, CS, CM>> {

  /** */
  private RS[] a;
  /** */
  private RS[] tau;
  /** */
  private RS[] work;

  /*  -- LAPACK routine (version 3.0) --   
  Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
  Courant Institute, Argonne National Lab, and Rice University   
  June 30, 1999   


  Purpose   
  =======   

  DORGTR generates a real orthogonal matrix Q which is defined as the   
  product of n-1 elementary reflectors of order N, as returned by   
  DSYTRD:   

  if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),   

  if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).   

  Arguments   
  =========   

  UPLO    (input) CHARACTER*1   
       = 'U': Upper triangle of A contains elementary reflectors   
              from DSYTRD;   
       = 'L': Lower triangle of A contains elementary reflectors   
              from DSYTRD.   

  N       (input) INTEGER   
       The order of the matrix Q. N >= 0.   

  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)   
       On entry, the vectors which define the elementary reflectors,   
       as returned by DSYTRD.   
       On exit, the N-by-N orthogonal matrix Q.   

  LDA     (input) INTEGER   
       The leading dimension of the array A. LDA >= max(1,N).   

  TAU     (input) DOUBLE PRECISION array, dimension (N-1)   
       TAU(i) must contain the scalar factor of the elementary   
       reflector H(i), as returned by DSYTRD.   

  WORK    (workspace/output) DOUBLE PRECISION array, dimension (LWORK)   
       On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

  LWORK   (input) INTEGER   
       The dimension of the array WORK. LWORK >= max(1,N-1).   
       For optimum performance LWORK >= (N-1)*NB, where NB is   
       the optimal blocksize.   

       If LWORK = -1, then a workspace query is assumed; the routine   
       only calculates the optimal size of the WORK array, returns   
       this value as the first entry of the WORK array, and no error   
       message related to LWORK is issued by XERBLA.   

  INFO    (output) INTEGER   
       = 0:  successful exit   
       < 0:  if INFO = -i, the i-th argument had an illegal value   

  =====================================================================   
  */
  /**
   * @param uplo uplo
   * @param n n
   * @param at at
   * @param lda lda
   * @param taut taut
   * @param workt workt
   * @param lwork lwork
   * @return result
   */
  public int execute(String uplo, int n, RS[] at, int lda, RS[] taut, RS[] workt, int lwork) {
    final RS unit = at[0].createUnit();

    //debug work_indtauを監視
    this.a = at;
    this.tau = taut;
    this.work = workt;

    int lwkopt = 0;

    int a_dim1 = lda;
    int a_offset = 1 + a_dim1 * 1;
    int pointer_a = -a_offset;
    int pointer_work = -1;
    int pointer_tau = -1;

    int info = 0;
    boolean lquery = lwork == -1;
    boolean upper = BLAS.lsame(uplo, "U"); //$NON-NLS-1$
    if (!upper && !BLAS.lsame(uplo, "L")) { //$NON-NLS-1$
      info = -1;
    } else if (n < 0) {
      info = -2;
    } else if (lda < Math.max(1, n)) {
      info = -4;
    } else /* if(complicated condition) */{
      if (lwork < Math.max(1, n - 1) && !lquery) {
        info = -7;
      }
    }

    if (info == 0) {
      int nb;
      if (upper) {
        nb = Clapack.ilaenv(1, "DORGQL", " ", n - 1, n - 1, n - 1, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
      } else {
        nb = Clapack.ilaenv(1, "DORGQR", " ", n - 1, n - 1, n - 1, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
      }
      lwkopt = Math.max(1, n - 1) * nb;
      this.work[1 + pointer_work] = unit.create(lwkopt);
    }

    if (info != 0) {
      BLAS.xerbla("DORGTR", -info); //$NON-NLS-1$
      return info;
    } else if (lquery) {
      return info;
    }

    /* Quick return if possible */
    if (n == 0) {
      this.work[1 + pointer_work] = unit.create(1);
      return info;
    }

    if (upper) {
      /* Q was determined by a call to DSYTRD with UPLO = 'U'   
         Shift the vectors which define the elementary reflectors one   
         column to the left, and set the last row and column of Q to   
         those of the unit matrix */
      for (int j = 1; j <= n - 1; ++j) {
        for (int i = 1; i <= j - 1; ++i) {
          this.a[j * a_dim1 + i + pointer_a] = this.a[(j + 1) * a_dim1 + i + pointer_a];
        }
        this.a[j * a_dim1 + n + pointer_a] = unit.create(0);
      }

      for (int i = 1; i <= n - 1; ++i) {
        this.a[n * a_dim1 + i + pointer_a] = unit.create(0);
      }

      this.a[n * a_dim1 + n + pointer_a] = unit.create(1);

      /* Generate Q(1:n-1,1:n-1) */
      info = dorgql_(n - 1, n - 1, n - 1, a_offset + pointer_a, lda, 1 + pointer_tau, 1 + pointer_work, lwork);
    } else {
      /* Q was determined by a call to DSYTRD with UPLO = 'L'.   
         Shift the vectors which define the elementary reflectors one   
         column to the right, and set the first row and column of Q to   
         those of the unit matrix */
      for (int j = n; j >= 2; --j) {
        this.a[j * a_dim1 + 1 + pointer_a] = unit.create(0);
        for (int i = j + 1; i <= n; ++i) {
          this.a[j * a_dim1 + i + pointer_a] = this.a[(j - 1) * a_dim1 + i + pointer_a];
          /* L40: */
        }
        /* L50: */
      }
      this.a[1 * a_dim1 + 1 + pointer_a] = unit.create(1);

      for (int i = 2; i <= n; ++i) {
        this.a[1 * a_dim1 + i + pointer_a] = unit.create(0);
        /* L60: */
      }
      if (n > 1) {
        /*           Generate Q(2:n,2:n) */
        info = dorgqr_(n - 1, n - 1, n - 1, 2 * a_dim1 + 2 + pointer_a, lda, 1 + pointer_tau, 1 + pointer_work, lwork);
      }
    }
    this.work[1 + pointer_work] = unit.create(lwkopt);
    return info;
  }

  /*  -- LAPACK routine (version 3.0) --   
  Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
  Courant Institute, Argonne National Lab, and Rice University   
  June 30, 1999   


  Purpose   
  =======   

  DORGQL generates an M-by-N real matrix Q with orthonormal columns,   
  which is defined as the last N columns of a product of K elementary   
  reflectors of order M   

     Q  =  H(k) . . . H(2) H(1)   

  as returned by DGEQLF.   

  Arguments   
  =========   

  M       (input) INTEGER   
       The number of rows of the matrix Q. M >= 0.   

  N       (input) INTEGER   
       The number of columns of the matrix Q. M >= N >= 0.   

  K       (input) INTEGER   
       The number of elementary reflectors whose product defines the   
       matrix Q. N >= K >= 0.   

  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)   
       On entry, the (n-k+i)-th column must contain the vector which   
       defines the elementary reflector H(i), for i = 1,2,...,k, as   
       returned by DGEQLF in the last k columns of its array   
       argument A.   
       On exit, the M-by-N matrix Q.   

  LDA     (input) INTEGER   
       The first dimension of the array A. LDA >= max(1,M).   

  TAU     (input) DOUBLE PRECISION array, dimension (K)   
       TAU(i) must contain the scalar factor of the elementary   
       reflector H(i), as returned by DGEQLF.   

  WORK    (workspace/output) DOUBLE PRECISION array, dimension (LWORK)   
       On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

  LWORK   (input) INTEGER   
       The dimension of the array WORK. LWORK >= max(1,N).   
       For optimum performance LWORK >= N*NB, where NB is the   
       optimal blocksize.   

       If LWORK = -1, then a workspace query is assumed; the routine   
       only calculates the optimal size of the WORK array, returns   
       this value as the first entry of the WORK array, and no error   
       message related to LWORK is issued by XERBLA.   

  INFO    (output) INTEGER   
       = 0:  successful exit   
       < 0:  if INFO = -i, the i-th argument has an illegal value   

  =====================================================================   
   */
  /**
   * Dorgtrのサブルーチンとして作成しているため、別のクラスから利用する場合は 被せて作る必要有り。
   * 
   * @param m m
   * @param n n
   * @param k k
   * @param aIndex aIndex
   * @param lda lda
   * @param tauIndex tauIndex
   * @param workIndex workIndex
   * @param lwork lwork
   * @return result
   */
  private int dorgql_(int m, int n, int k, int aIndex, int lda, int tauIndex, int workIndex, int lwork) {
    final RS unit = this.a[0].createUnit();

    int a_dim1 = lda;
    int a_offset = 1 + a_dim1 * 1;
    int pointer_a = aIndex - a_offset;
    int pointer_tau = tauIndex - 1;
    int pointer_work = workIndex - 1;

    int info = 0;
    int nb = Clapack.ilaenv(1, "DORGQL", " ", m, n, k, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
    int lwkopt = Math.max(1, n) * nb;
    this.work[1 + pointer_work] = unit.create(lwkopt);
    boolean lquery = lwork == -1;
    if (m < 0) {
      info = -1;
    } else if (n < 0 || n > m) {
      info = -2;
    } else if (k < 0 || k > n) {
      info = -3;
    } else if (lda < Math.max(1, m)) {
      info = -5;
    } else if (lwork < Math.max(1, n) && !lquery) {
      info = -8;
    }
    if (info != 0) {
      BLAS.xerbla("DORGQL", -info); //$NON-NLS-1$
      return info;
    } else if (lquery) {
      return info;
    }

    /* Quick return if possible */
    if (n <= 0) {
      this.work[1 + pointer_work] = unit.create(1);
      return info;
    }

    int nbmin = 2;
    int nx = 0;
    int ldwork = 0;
    int iws = n;
    if (nb > 1 && nb < k) {
      /* Determine when to cross over from blocked to unblocked code. Computing MAX */
      int i2 = Clapack.ilaenv(3, "DORGQL", " ", m, n, k, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
      nx = Math.max(0, i2);
      if (nx < k) {
        /* Determine if workspace is large enough for blocked code. */
        ldwork = n;
        iws = ldwork * nb;
        if (lwork < iws) {
          /* Not enough workspace to use optimal NB:  reduce NB and   
             determine the minimum value of NB. */
          nb = lwork / ldwork;
          /* Computing MAX */
          i2 = Clapack.ilaenv(2, "DORGQL", " ", m, n, k, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
          nbmin = Math.max(2, i2);
        }
      }
    }

    int kk;
    if (nb >= nbmin && nb < k && nx < k) {
      int i2 = (k - nx + nb - 1) / nb * nb;
      kk = Math.min(k, i2);

      /* Set A(m-kk+1:m,1:n-kk) to zero. */
      for (int j = 1; j <= n - kk; ++j) {
        for (int i = m - kk + 1; i <= m; ++i) {
          this.a[j * a_dim1 + i + pointer_a] = unit.create(0);
        }
      }
    } else {
      kk = 0;
    }

    /* Use unblocked code for the first or only block. */
    info = dorg2l_(m - kk, n - kk, k - kk, a_offset + pointer_a, lda, 1 + pointer_tau, 1 + pointer_work);

    if (kk > 0) {
      /* Use blocked code */

      for (int i = k - kk + 1; nb < 0 ? i >= k : i <= k; i += nb) {
        /* Computing MIN */
        int ib = Math.min(nb, k - i + 1);
        if (n - k + i > 1) {
          /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */

          int i3 = m - k + i + ib - 1;
          dlarft_("Backward", "Columnwise", i3, ib, (n - k + i) * a_dim1 + 1 + pointer_a, lda, i + pointer_tau, 1 + pointer_work, ldwork); //$NON-NLS-1$ //$NON-NLS-2$

          /* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left */

          i3 = m - k + i + ib - 1;
          int i4 = n - k + i - 1;
          dlarfb_("Left", "No transpose", "Backward", "Columnwise", i3, i4, ib, (n - k + i) * a_dim1 + 1 + pointer_a, lda, 1 + pointer_work, ldwork, a_offset + pointer_a, lda, ib + 1 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
              + pointer_work, ldwork);
        }

        /* Apply H to rows 1:m-k+i+ib-1 of current block */
        info = dorg2l_(m - k + i + ib - 1, ib, ib, (n - k + i) * a_dim1 + 1 + pointer_a, lda, i + pointer_tau, 1 + pointer_work);

        /* Set rows m-k+i+ib:m of current block to zero */
        for (int j = n - k + i; j <= n - k + i + ib - 1; ++j) {
          for (int l = m - k + i + ib; l <= m; ++l) {
            this.a[j * a_dim1 + l + pointer_a] = unit.create(0);
          }
        }
      }
    }

    this.work[1 + pointer_work] = unit.create(iws);
    return info;
  }

  /*  -- LAPACK routine (version 3.0) --   
  Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
  Courant Institute, Argonne National Lab, and Rice University   
  February 29, 1992   


  Purpose   
  =======   

  DORG2L generates an m by n real matrix Q with orthonormal columns,   
  which is defined as the last n columns of a product of k elementary   
  reflectors of order m   

     Q  =  H(k) . . . H(2) H(1)   

  as returned by DGEQLF.   

  Arguments   
  =========   

  M       (input) INTEGER   
       The number of rows of the matrix Q. M >= 0.   

  N       (input) INTEGER   
       The number of columns of the matrix Q. M >= N >= 0.   

  K       (input) INTEGER   
       The number of elementary reflectors whose product defines the   
       matrix Q. N >= K >= 0.   

  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)   
       On entry, the (n-k+i)-th column must contain the vector which   
       defines the elementary reflector H(i), for i = 1,2,...,k, as   
       returned by DGEQLF in the last k columns of its array   
       argument A.   
       On exit, the m by n matrix Q.   

  LDA     (input) INTEGER   
       The first dimension of the array A. LDA >= max(1,M).   

  TAU     (input) DOUBLE PRECISION array, dimension (K)   
       TAU(i) must contain the scalar factor of the elementary   
       reflector H(i), as returned by DGEQLF.   

  WORK    (workspace) DOUBLE PRECISION array, dimension (N)   

  INFO    (output) INTEGER   
       = 0: successful exit   
       < 0: if INFO = -i, the i-th argument has an illegal value   

  =====================================================================   
  */
  /**
   * @param m m
   * @param n n
   * @param k k
   * @param aIndex aIndex
   * @param lda lda
   * @param tauIndex tauIndex
   * @param workIndex workIndex
   * @return result
   */
  private int dorg2l_(int m, int n, int k, int aIndex, int lda, int tauIndex, int workIndex) {
    final RS unit = this.a[0].createUnit();

    int a_dim1 = lda;
    int a_offset = 1 + a_dim1 * 1;
    int pointer_a = aIndex - a_offset;
    int pointer_tau = tauIndex - 1;
    int pointer_work = workIndex - 1;

    int info = 0;
    if (m < 0) {
      info = -1;
    } else if (n < 0 || n > m) {
      info = -2;
    } else if (k < 0 || k > n) {
      info = -3;
    } else if (lda < Math.max(1, m)) {
      info = -5;
    }
    if (info != 0) {
      BLAS.xerbla("DORG2L", -info); //$NON-NLS-1$
      return info;
    }

    /* Quick return if possible */
    if (n <= 0) {
      return info;
    }

    /* Initialize columns 1:n-k to columns of the unit matrix */
    for (int j = 1; j <= n - k; ++j) {
      for (int l = 1; l <= m; ++l) {
        this.a[(j) * a_dim1 + l + pointer_a] = unit.create(0);
      }
      this.a[(j) * a_dim1 + m - n + j + pointer_a] = unit.create(1);
    }

    for (int i = 1; i <= k; ++i) {
      int ii = n - k + i;

      /* Apply H(i) to A(1:m-k+i,1:n-k+i) from the left */
      this.a[ii * a_dim1 + m - n + ii + pointer_a] = unit.create(1);
      dlarf_("Left", m - n + ii, ii - 1, (ii) * a_dim1 + 1 + pointer_a, 1, this.tau[i + pointer_tau], a_offset + pointer_a, lda, 1 + pointer_work); //$NON-NLS-1$
      RS d1 = this.tau[i + pointer_tau].unaryMinus();

      //array copy
      RS[] aTemp = unit.createArray(this.a.length - (ii * a_dim1 + 1 + pointer_a));
      System.arraycopy(this.a, ii * a_dim1 + 1 + pointer_a, aTemp, 0, aTemp.length);
      BLAS.dscal(m - n + ii - 1, d1, aTemp, 1);
      System.arraycopy(aTemp, 0, this.a, ii * a_dim1 + 1 + pointer_a, aTemp.length);

      this.a[ii * a_dim1 + m - n + ii + pointer_a] = unit.create(1).subtract(this.tau[i + pointer_tau]);

      /* Set A(m-k+i+1:m,n-k+i) to zero */
      for (int l = m - n + ii + 1; l <= m; ++l) {
        this.a[(ii) * a_dim1 + l + pointer_a] = unit.create(0);
      }
    }

    return info;
  }

  /*  -- LAPACK auxiliary routine (version 3.0) --   
  Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
  Courant Institute, Argonne National Lab, and Rice University   
  February 29, 1992   


  Purpose   
  =======   

  DLARF applies a real elementary reflector H to a real m by n matrix   
  C, from either the left or the right. H is represented in the form   

     H = I - tau * v * v'   

  where tau is a real scalar and v is a real vector.   

  If tau = 0, then H is taken to be the unit matrix.   

  Arguments   
  =========   

  SIDE    (input) CHARACTER*1   
       = 'L': form  H * C   
       = 'R': form  C * H   

  M       (input) INTEGER   
       The number of rows of the matrix C.   

  N       (input) INTEGER   
       The number of columns of the matrix C.   

  V       (input) DOUBLE PRECISION array, dimension   
                  (1 + (M-1)*abs(INCV)) if SIDE = 'L'   
               or (1 + (N-1)*abs(INCV)) if SIDE = 'R'   
       The vector v in the representation of H. V is not used if   
       TAU = 0.   

  INCV    (input) INTEGER   
       The increment between elements of v. INCV <> 0.   

  TAU     (input) DOUBLE PRECISION   
       The value tau in the representation of H.   

  C       (input/output) DOUBLE PRECISION array, dimension (LDC,N)   
       On entry, the m by n matrix C.   
       On exit, C is overwritten by the matrix H * C if SIDE = 'L',   
       or C * H if SIDE = 'R'.   

  LDC     (input) INTEGER   
       The leading dimension of the array C. LDC >= max(1,M).   

  WORK    (workspace) DOUBLE PRECISION array, dimension   
                      (N) if SIDE = 'L'   
                   or (M) if SIDE = 'R'   

  =====================================================================   
  */
  /**
   * @param side side
   * @param m m
   * @param n n
   * @param vIndex vIndex
   * @param incv incv
   * @param tau tau
   * @param cIndex cIndex
   * @param ldc ldc
   * @param workIndex workIndex
   * @return result
   */
  private int dlarf_(String side, int m, int n, int vIndex, int incv, RS tau, int cIndex, int ldc, int workIndex) {
    final RS unit = this.a[0].createUnit();

    RS[] v = this.a;
    RS[] c = this.a;

    RS c_b4 = unit.createUnit();
    RS c_b5 = unit.createZero();

    int pointer_v = vIndex - 1;
    int c_dim1 = ldc;
    int c_offset = 1 + c_dim1 * 1;
    int pointer_c = cIndex - c_offset;
    int pointer_work = workIndex - 1;

    if (BLAS.lsame(side, "L")) { //$NON-NLS-1$
      /* Form  H * C */
      if (!tau.isZero()) {
        //array copy
        RS[] cTemp = unit.createArray(c.length - (c_offset + pointer_c));
        RS[] workTemp = unit.createArray(this.work.length - (1 + pointer_work));
        RS[] vTemp = unit.createArray(v.length - (1 + pointer_v));
        System.arraycopy(c, c_offset + pointer_c, cTemp, 0, cTemp.length);
        System.arraycopy(this.work, 1 + pointer_work, workTemp, 0, workTemp.length);
        System.arraycopy(v, 1 + pointer_v, vTemp, 0, vTemp.length);

        /* w := C' * v */
        BLAS.dgemv("Transpose", m, n, c_b4, cTemp, ldc, vTemp, incv, c_b5, workTemp, 1); //$NON-NLS-1$

        BLAS.dger(m, n, tau.unaryMinus(), vTemp, incv, workTemp, 1, cTemp, ldc);
        System.arraycopy(cTemp, 0, c, c_offset + pointer_c, cTemp.length);
        System.arraycopy(workTemp, 0, this.work, 1 + pointer_work, workTemp.length);
        System.arraycopy(vTemp, 0, v, 1 + pointer_v, vTemp.length);
      }
    } else {
      /* Form  C * H */
      if (!tau.isZero()) {
        //array copy
        RS[] cTemp = unit.createArray(c.length - (c_offset + pointer_c));
        RS[] workTemp = unit.createArray(this.work.length - (1 + pointer_work));
        RS[] vTemp = unit.createArray(v.length - (1 + pointer_v));
        System.arraycopy(c, c_offset + pointer_c, cTemp, 0, cTemp.length);
        System.arraycopy(this.work, 1 + pointer_work, workTemp, 0, workTemp.length);
        System.arraycopy(v, 1 + pointer_v, vTemp, 0, vTemp.length);

        /* w := C * v */
        BLAS.dgemv("No transpose", m, n, c_b4, cTemp, ldc, vTemp, incv, c_b5, workTemp, 1); //$NON-NLS-1$

        BLAS.dger(m, n, tau.unaryMinus(), workTemp, 1, vTemp, incv, cTemp, ldc);
        System.arraycopy(cTemp, 0, c, c_offset + pointer_c, cTemp.length);
        System.arraycopy(workTemp, 0, this.work, 1 + pointer_work, workTemp.length);
        System.arraycopy(vTemp, 0, v, 1 + pointer_v, vTemp.length);
      }
    }

    return 0;
  }

  /*  -- LAPACK routine (version 3.0) --   
  Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
  Courant Institute, Argonne National Lab, and Rice University   
  June 30, 1999   


  Purpose   
  =======   

  DORGQR generates an M-by-N real matrix Q with orthonormal columns,   
  which is defined as the first N columns of a product of K elementary   
  reflectors of order M   

     Q  =  H(1) H(2) . . . H(k)   

  as returned by DGEQRF.   

  Arguments   
  =========   

  M       (input) INTEGER   
       The number of rows of the matrix Q. M >= 0.   

  N       (input) INTEGER   
       The number of columns of the matrix Q. M >= N >= 0.   

  K       (input) INTEGER   
       The number of elementary reflectors whose product defines the   
       matrix Q. N >= K >= 0.   

  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)   
       On entry, the i-th column must contain the vector which   
       defines the elementary reflector H(i), for i = 1,2,...,k, as   
       returned by DGEQRF in the first k columns of its array   
       argument A.   
       On exit, the M-by-N matrix Q.   

  LDA     (input) INTEGER   
       The first dimension of the array A. LDA >= max(1,M).   

  TAU     (input) DOUBLE PRECISION array, dimension (K)   
       TAU(i) must contain the scalar factor of the elementary   
       reflector H(i), as returned by DGEQRF.   

  WORK    (workspace/output) DOUBLE PRECISION array, dimension (LWORK)   
       On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

  LWORK   (input) INTEGER   
       The dimension of the array WORK. LWORK >= max(1,N).   
       For optimum performance LWORK >= N*NB, where NB is the   
       optimal blocksize.   

       If LWORK = -1, then a workspace query is assumed; the routine   
       only calculates the optimal size of the WORK array, returns   
       this value as the first entry of the WORK array, and no error   
       message related to LWORK is issued by XERBLA.   

  INFO    (output) INTEGER   
       = 0:  successful exit   
       < 0:  if INFO = -i, the i-th argument has an illegal value   

  =====================================================================   
  */
  /**
   * @param m m
   * @param n n
   * @param k k
   * @param aIndex aIndex
   * @param lda lda
   * @param tauIndex tauIndex
   * @param workIndex workIndex
   * @param lwork lwork
   * @return result
   */
  private int dorgqr_(int m, int n, int k, int aIndex, int lda, int tauIndex, int workIndex, int lwork) {
    final RS unit = this.a[0].createUnit();

    int a_dim1 = lda;
    int a_offset = 1 + a_dim1 * 1;
    int pointer_a = aIndex - a_offset;
    int pointer_tau = tauIndex - 1;
    int pointer_work = workIndex - 1;

    int nb = Clapack.ilaenv(1, "DORGQR", " ", m, n, k, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
    int lwkopt = Math.max(1, n) * nb;
    this.work[1 + pointer_work] = unit.create(lwkopt);
    boolean lquery = lwork == -1;

    int info = 0;
    if (m < 0) {
      info = -1;
    } else if (n < 0 || n > m) {
      info = -2;
    } else if (k < 0 || k > n) {
      info = -3;
    } else if (lda < Math.max(1, m)) {
      info = -5;
    } else if (lwork < Math.max(1, n) && !lquery) {
      info = -8;
    }
    if (info != 0) {
      BLAS.xerbla("DORGQR", -info); //$NON-NLS-1$
      return info;
    } else if (lquery) {
      return info;
    }

    /* Quick return if possible */
    if (n <= 0) {
      this.work[1 + pointer_work] = unit.create(1);
      return info;
    }

    int ldwork = 0;
    int nbmin = 2;
    int nx = 0;
    int iws = n;
    if (nb > 1 && nb < k) {
      /* Determine when to cross over from blocked to unblocked code.   

      Computing MAX */
      int i2 = Clapack.ilaenv(3, "DORGQR", " ", m, n, k, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
      nx = Math.max(0, i2);
      if (nx < k) {
        /* Determine if workspace is large enough for blocked code. */
        ldwork = n;
        iws = ldwork * nb;
        if (lwork < iws) {
          /* Not enough workspace to use optimal NB:  reduce NB and   
             determine the minimum value of NB. */

          nb = lwork / ldwork;
          /* Computing MAX */
          i2 = Clapack.ilaenv(2, "DORGQR", " ", m, n, k, -1, 6, 1, unit); //$NON-NLS-1$ //$NON-NLS-2$
          nbmin = Math.max(2, i2);
        }
      }
    }

    int ki = 0;
    int kk;
    if (nb >= nbmin && nb < k && nx < k) {
      /* Use blocked code after the last block.   
         The first kk columns are handled by the block method. */

      ki = (k - nx - 1) / nb * nb;
      kk = Math.min(k, ki + nb);

      /* Set A(1:kk,kk+1:n) to zero. */
      for (int j = kk + 1; j <= n; ++j) {
        for (int i = 1; i <= kk; ++i) {
          this.a[j * a_dim1 + i + pointer_a] = unit.create(0);
        }
      }
    } else {
      kk = 0;
    }

    /* Use unblocked code for the last or only block. */
    if (kk < n) {
      info = dorg2r_(m - kk, n - kk, k - kk, (kk + 1) * a_dim1 + kk + 1 + pointer_a, lda, kk + 1 + pointer_tau, 1 + pointer_work);
    }

    if (kk > 0) {
      for (int i = ki + 1; -nb < 0 ? i >= 1 : i <= 1; i += -nb) {
        /* Computing MIN */
        int ib = Math.min(nb, k - i + 1);
        if (i + ib <= n) {
          /* Form the triangular factor of the block reflector   
             H = H(i) H(i+1) . . . H(i+ib-1) */

          dlarft_("Forward", "Columnwise", m - i + 1, ib, i * a_dim1 + i + pointer_a, lda, i + pointer_tau, 1 + pointer_work, ldwork); //$NON-NLS-1$ //$NON-NLS-2$

          /* Apply H to A(i:m,i+ib:n) from the left */
          dlarfb_(
              "Left", "No transpose", "Forward", "Columnwise", m - i + 1, n - i - ib + 1, ib, i * a_dim1 + i + pointer_a, lda, 1 + pointer_work, ldwork, (i + ib) * a_dim1 + i + pointer_a, lda, ib + 1 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
                  + pointer_work, ldwork);
        }

        /* Apply H to rows i:m of current block */
        info = dorg2r_(m - i + 1, ib, ib, i * a_dim1 + i + pointer_a, lda, i + pointer_tau, 1 + pointer_work);

        /* Set rows 1:i-1 of current block to zero */
        for (int j = i; j <= i + ib - 1; ++j) {
          for (int l = 1; l <= i - 1; ++l) {
            this.a[j * a_dim1 + l + pointer_a] = unit.create(0);
          }
        }
      }
    }

    this.work[1 + pointer_work] = unit.create(iws);
    return info;
  }

  /*  -- LAPACK auxiliary routine (version 3.0) --   
  Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
  Courant Institute, Argonne National Lab, and Rice University   
  February 29, 1992   


  Purpose   
  =======   

  DLARFT forms the triangular factor T of a real block reflector H   
  of order n, which is defined as a product of k elementary reflectors.   

  If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;   

  If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.   

  If STOREV = 'C', the vector which defines the elementary reflector   
  H(i) is stored in the i-th column of the array V, and   

  H  =  I - V * T * V'   

  If STOREV = 'R', the vector which defines the elementary reflector   
  H(i) is stored in the i-th row of the array V, and   

  H  =  I - V' * T * V   

  Arguments   
  =========   

  DIRECT  (input) CHARACTER*1   
       Specifies the order in which the elementary reflectors are   
       multiplied to form the block reflector:   
       = 'F': H = H(1) H(2) . . . H(k) (Forward)   
       = 'B': H = H(k) . . . H(2) H(1) (Backward)   

  STOREV  (input) CHARACTER*1   
       Specifies how the vectors which define the elementary   
       reflectors are stored (see also Further Details):   
       = 'C': columnwise   
       = 'R': rowwise   

  N       (input) INTEGER   
       The order of the block reflector H. N >= 0.   

  K       (input) INTEGER   
       The order of the triangular factor T (= the number of   
       elementary reflectors). K >= 1.   

  V       (input/output) DOUBLE PRECISION array, dimension   
                            (LDV,K) if STOREV = 'C'   
                            (LDV,N) if STOREV = 'R'   
       The matrix V. See further details.   

  LDV     (input) INTEGER   
       The leading dimension of the array V.   
       If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.   

  TAU     (input) DOUBLE PRECISION array, dimension (K)   
       TAU(i) must contain the scalar factor of the elementary   
       reflector H(i).   

  T       (output) DOUBLE PRECISION array, dimension (LDT,K)   
       The k by k triangular factor T of the block reflector.   
       If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is   
       lower triangular. The rest of the array is not used.   

  LDT     (input) INTEGER   
       The leading dimension of the array T. LDT >= K.   

  Further Details   
  ===============   

  The shape of the matrix V and the storage of the vectors which define   
  the H(i) is best illustrated by the following example with n = 5 and   
  k = 3. The elements equal to 1 are not stored; the corresponding   
  array elements are modified but restored on exit. The rest of the   
  array is not used.   

  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':   

            V = (  1       )                 V = (  1 v1 v1 v1 v1 )   
                ( v1  1    )                     (     1 v2 v2 v2 )   
                ( v1 v2  1 )                     (        1 v3 v3 )   
                ( v1 v2 v3 )   
                ( v1 v2 v3 )   

  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':   

            V = ( v1 v2 v3 )                 V = ( v1 v1  1       )   
                ( v1 v2 v3 )                     ( v2 v2 v2  1    )   
                (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )   
                (     1 v3 )   
                (        1 )   

  =====================================================================   
   */
  /**
   * @param direct direct
   * @param storev storev
   * @param n n
   * @param k k
   * @param aIndex aIndex
   * @param ldv ldv
   * @param tauIndex tauIndex
   * @param workIndex workIndex
   * @param ldt ldt
   * @return result
   */
  private int dlarft_(String direct, String storev, int n, int k, int aIndex, int ldv, int tauIndex, int workIndex, int ldt) {
    final RS unit = this.a[0].createUnit();

    RS[] v = this.a;
    RS[] t = this.work;

    RS c_b8 = unit.createZero();

    int v_dim1 = ldv;
    int v_offset = 1 + v_dim1 * 1;
    int pointer_v = aIndex - v_offset;
    int pointer_tau = tauIndex - 1;
    int t_dim1 = ldt;
    int t_offset = 1 + t_dim1 * 1;
    int pointer_t = workIndex - t_offset;

    if (n == 0) {
      return 0;
    }

    if (BLAS.lsame(direct, "F")) { //$NON-NLS-1$
      for (int i = 1; i <= k; ++i) {
        if (this.tau[i + pointer_tau].isZero()) {
          /* H(i)  =  I */
          for (int j = 1; j <= i; ++j) {
            t[i * t_dim1 + j + pointer_t] = unit.create(0);
            /* L10: */
          }
        } else {
          /* general case */
          RS vii = v[i * v_dim1 + i + pointer_v];
          v[i * v_dim1 + i + pointer_v] = unit.create(1);

          if (BLAS.lsame(storev, "C")) { //$NON-NLS-1$
            /* T(1:i-1,i) := - tau(i) * V(i:n,1:i-1)' * V(i:n,i) */
            RS d1 = this.tau[i + pointer_tau].unaryMinus();

            //array copy
            RS[] v1iTemp = unit.createArray(v.length - (v_dim1 + i + pointer_v));
            RS[] viiTemp = unit.createArray(v.length - (i * v_dim1 + i + pointer_v));
            RS[] ti1Temp = unit.createArray(t.length - (i * t_dim1 + 1 + pointer_t));
            System.arraycopy(t, i * t_dim1 + 1 + pointer_t, ti1Temp, 0, ti1Temp.length);
            System.arraycopy(v, v_dim1 + i + pointer_v, v1iTemp, 0, v1iTemp.length);
            System.arraycopy(v, i * v_dim1 + i + pointer_v, viiTemp, 0, viiTemp.length);
            BLAS.dgemv("Transpose", n - i + 1, i - 1, d1, v1iTemp, ldv, viiTemp, 1, c_b8, ti1Temp, 1); //$NON-NLS-1$
            System.arraycopy(ti1Temp, 0, t, i * t_dim1 + 1 + pointer_t, ti1Temp.length);
          } else {
            /* T(1:i-1,i) := - tau(i) * V(1:i-1,i:n) * V(i,i:n)' */
            RS d1 = this.tau[i + pointer_tau].unaryMinus();

            //array copy
            RS[] vi1Temp = unit.createArray(i * v_dim1 + 1 + pointer_v);
            RS[] viiTemp = unit.createArray(i * v_dim1 + i + pointer_v);
            RS[] ti1Temp = unit.createArray(t.length - (i * t_dim1 + 1 + pointer_t));
            System.arraycopy(v, i * v_dim1 + 1 + pointer_v, vi1Temp, 0, vi1Temp.length);
            System.arraycopy(v, i * v_dim1 + i + pointer_v, viiTemp, 0, viiTemp.length);
            System.arraycopy(t, i * t_dim1 + 1 + pointer_t, ti1Temp, 0, ti1Temp.length);
            BLAS.dgemv("No transpose", i - 1, n - i + 1, d1, vi1Temp, ldv, viiTemp, ldv, c_b8, ti1Temp, 1); //$NON-NLS-1$
            System.arraycopy(ti1Temp, 0, t, i * t_dim1 + 1 + pointer_t, ti1Temp.length);
          }
          v[i * v_dim1 + i + pointer_v] = vii;

          //array copy
          RS[] toTemp = unit.createArray(t.length - (t_offset + pointer_t));
          RS[] ti1Temp = unit.createArray(t.length - (i * t_dim1 + 1 + pointer_t));
          System.arraycopy(t, t_offset + pointer_t, ti1Temp, 0, toTemp.length);
          System.arraycopy(t, i * t_dim1 + 1 + pointer_t, ti1Temp, 0, ti1Temp.length);
          BLAS.dtrmv("Upper", "No transpose", "Non-unit", i - 1, toTemp, ldt, ti1Temp, 1); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
          System.arraycopy(ti1Temp, 0, t, i * t_dim1 + 1 + pointer_t, ti1Temp.length);

          t[i * t_dim1 + i + pointer_t] = this.tau[i + pointer_tau];
        }
        /* L20: */
      }
    } else {
      for (int i = k; i >= 1; --i) {
        if (this.tau[i + pointer_tau].isZero()) {
          /* H(i)  =  I */
          for (int j = i; j <= k; ++j) {
            t[i * t_dim1 + j + pointer_t] = unit.createZero();
            /* L30: */
          }
        } else {
          /* general case */
          if (i < k) {
            if (BLAS.lsame(storev, "C")) { //$NON-NLS-1$
              RS vii = v[i * v_dim1 + n - k + i + pointer_v];// v_ref(n - k + i__, i__);
              v[i * v_dim1 + n - k + i + pointer_v] = unit.create(1);

              /* T(i+1:k,i) := - tau(i) * V(1:n-k+i,i+1:k)' * V(1:n-k+i,i) */
              RS d1 = this.tau[i + pointer_tau].unaryMinus();
              //array copy
              RS[] tTemp = unit.createArray(t.length - (i * t_dim1 + i + 1 + pointer_t));
              RS[] viTemp = unit.createArray(v.length - ((i) * v_dim1 + 1 + pointer_v));
              RS[] vi1Temp = unit.createArray(v.length - ((i + 1) * v_dim1 + 1 + pointer_v));
              System.arraycopy(t, i * t_dim1 + i + 1 + pointer_t, tTemp, 0, tTemp.length);
              System.arraycopy(v, (i) * v_dim1 + 1 + pointer_v, viTemp, 0, viTemp.length);
              System.arraycopy(v, (i + 1) * v_dim1 + 1 + pointer_v, vi1Temp, 0, vi1Temp.length);
              BLAS.dgemv("Transpose", n - k + i, k - i, d1, vi1Temp, ldv, viTemp, 1, c_b8, tTemp, 1); //$NON-NLS-1$
              System.arraycopy(tTemp, 0, t, i * t_dim1 + i + 1 + pointer_t, tTemp.length);

              v[i * v_dim1 + n - k + i + pointer_v] = vii;
            } else {
              RS vii = v[(n - k + i) * v_dim1 + i + pointer_v];//v_ref(i__, n - k + i__);
              v[(n - k + i) * v_dim1 + i + pointer_v] = unit.create(1);

              /* T(i+1:k,i) := - tau(i) * V(i+1:k,1:n-k+i) * V(i,1:n-k+i)' */

              RS d1 = this.tau[i + pointer_tau].unaryMinus();

              //array copy
              RS[] tTemp = unit.createArray(t.length - (i * t_dim1 + i + 1 + pointer_t));
              RS[] v1i1Temp = unit.createArray(v.length - (v_dim1 + 1 + i + pointer_v));
              RS[] v1iTemp = unit.createArray(v.length - (v_dim1 + i + pointer_v));
              System.arraycopy(t, i * t_dim1 + i + 1 + pointer_t, tTemp, 0, tTemp.length);
              System.arraycopy(v, v_dim1 + 1 + i + pointer_v, v1i1Temp, 0, v1i1Temp.length);
              System.arraycopy(v, v_dim1 + i + pointer_v, v1iTemp, 0, v1iTemp.length);
              BLAS.dgemv("No transpose", k - i, n - k + i, d1, v1i1Temp, ldv, v1iTemp, ldv, c_b8, tTemp, 1); //$NON-NLS-1$
              System.arraycopy(tTemp, 0, t, i * t_dim1 + i + 1 + pointer_t, tTemp.length);

              v[(n - k + i) * v_dim1 + i + pointer_v] = vii;
            }

            //array copy
            RS[] ti1i1 = unit.createArray(t.length - ((i + 1) * t_dim1 + i + 1 + pointer_t));
            RS[] tii1 = unit.createArray(t.length - (i * t_dim1 + i + 1 + pointer_t));
            System.arraycopy(t, (i + 1) * t_dim1 + i + 1 + pointer_t, ti1i1, 0, ti1i1.length);
            System.arraycopy(t, (i) * t_dim1 + i + 1 + pointer_t, tii1, 0, tii1.length);
            BLAS.dtrmv("Lower", "No transpose", "Non-unit", k - i, ti1i1, ldt, tii1, 1); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            System.arraycopy(tii1, 0, t, i * t_dim1 + i + 1 + pointer_t, tii1.length);
          }
          t[i * t_dim1 + i + pointer_t] = this.tau[i + pointer_tau];
        }
        /* L40: */
      }
    }

    return 0;
  }

  /**
   * このメソッドはdorgtrのサブルーチンとして組み込んでいるので、
   * 単体で使用する場合はClapackクラスにDlarfb用のメソッドを作ってから使用すること。
   * 
   * @param side side
   * @param trans trans
   * @param direct direct
   * @param storev storev
   * @param m m
   * @param n n
   * @param k k
   * @param vIndex 開始位置(配列aの部分行列)
   * @param ldv ldv
   * @param tIndex 開始位置(配列workの部分行列)
   * @param ldt ldt
   * @param cIndex 開始位置(aの部分行列)
   * @param ldc ldc
   * @param workIndex 開始位置(workの部分行列)
   * @param ldwork2 ldwork2
   */
  private void dlarfb_(String side, String trans, String direct, String storev, int m, int n, int k, int vIndex, int ldv, int tIndex, int ldt, int cIndex, int ldc, int workIndex, int ldwork2) {
    final RS unit = this.a[0].createUnit();

    RS[] vin = unit.createArray(this.a.length - vIndex);
    RS[] tin = unit.createArray(this.work.length - tIndex);
    RS[] cin = unit.createArray(this.a.length - cIndex);
    RS[] workin = unit.createArray(this.work.length - workIndex);

    System.arraycopy(this.a, vIndex, vin, 0, vin.length);
    System.arraycopy(this.work, tIndex, tin, 0, tin.length);
    System.arraycopy(this.a, cIndex, cin, 0, cin.length);
    System.arraycopy(this.work, workIndex, workin, 0, workin.length);

    new Dlarfb<RS,RM,CS,CM>().dlarfb_(side, trans, direct, storev, m, n, k, vin, ldv, tin, ldt, cin, ldc, workin, ldwork2);

    System.arraycopy(cin, 0, this.a, cIndex, cin.length);
  }

  /**
   * @param m m
   * @param n n
   * @param k k
   * @param aIndex aIndex
   * @param lda lda
   * @param tauIndex tauIndex
   * @param workIndex workIndex
   * @return result
   */
  private int dorg2r_(int m, int n, int k, int aIndex, int lda, int tauIndex, int workIndex) {
    final RS unit = this.a[0].createUnit();

    int a_dim1 = lda;
    int a_offset = 1 + a_dim1 * 1;
    int pointer_a = -a_offset;
    int pointer_tau = -1;
    int pointer_work = -1;

    int info = 0;
    if (m < 0) {
      info = -1;
    } else if (n < 0 || n > m) {
      info = -2;
    } else if (k < 0 || k > n) {
      info = -3;
    } else if (lda < Math.max(1, m)) {
      info = -5;
    }
    if (info != 0) {
      BLAS.xerbla("DORG2R", -info); //$NON-NLS-1$
      return info;
    }

    /* Quick return if possible */
    if (n <= 0) {
      return info;
    }

    /* Initialise columns k+1:n to columns of the unit matrix */
    for (int j = k + 1; j <= n; ++j) {
      for (int l = 1; l <= m; ++l) {
        this.a[j * a_dim1 + l + pointer_a] = unit.create(0);
      }
      this.a[j * a_dim1 + j + pointer_a] = unit.create(1);
    }

    for (int i = k; i >= 1; --i) {
      /* Apply H(i) to A(i:m,i:n) from the left */
      if (i < n) {
        this.a[i * a_dim1 + i + pointer_a] = unit.create(1);
        dlarf_("Left", m - i + 1, n - i, i * a_dim1 + i + pointer_a, 1, this.tau[i + pointer_tau], (i + 1) * a_dim1 + i + pointer_a, lda, 1 + pointer_work); //$NON-NLS-1$
      }
      if (i < m) {
        RS d1 = this.tau[i + pointer_tau].unaryMinus();
        RS[] aTemp = unit.createArray(this.a.length - (i * a_dim1 + i + 1 + pointer_a));
        System.arraycopy(this.a, i * a_dim1 + i + 1 + pointer_a, aTemp, 0, aTemp.length);
        BLAS.dscal(m - i, d1, aTemp, 1);
        System.arraycopy(aTemp, 0, this.a, i * a_dim1 + i + 1 + pointer_a, aTemp.length);
      }
      this.a[i * a_dim1 + i + pointer_a] = unit.create(1).subtract(this.tau[i + pointer_tau]);

      /*  Set A(1:i-1,i) to zero */
      for (int l = 1; l <= i - 1; ++l) {
        this.a[i * a_dim1 + l + pointer_a] = unit.create(0);
      }
    }
    return info;
  }
}