\documentstyle[twoside]{article}
\oddsidemargin 0.0in
\evensidemargin 0.0in
\pagestyle{headings}
\topmargin -0.5in
\textheight 9.0in
\textwidth 6.5in
\def \la{\langle}
\def \ra{\rangle}
\def \lp{\left(}
\def \rp{\right)}
\def \lb{\left[}
\def \rb{\right]}
\def \lc{\left\{}
\def \rc{\right\}}
\def \om{\omega}
\def \oom{\frac{1}{\omega}}
\def \ha{\frac{1}{2}}
\def \sA{{\bf A}}
\def \su{{\bf u}}
\def \sb{{\bf b}}
\def \sr{{\bf r}}
\def \pD{D^{(\pi)}}
\def \pC{C^{(\pi)}}
\def \pL{C_L^{(\pi)}}
\def \pU{C_U^{(\pi)}}
\begin{document}
\setcounter{page}{0}
\vspace*{1.0in}
\centerline{\bf NSPCG User's Guide}
\centerline{\bf Version 1.0}
\vspace*{0.24in}
\centerline{A Package for}
\centerline{Solving Large Sparse Linear Systems by}
\centerline{Various Iterative Methods}
\vspace*{0.24in}
\centerline{Thomas C. Oppe}
\centerline{Wayne D. Joubert}
\centerline{David R. Kincaid}
\vspace*{0.24in}
\centerline{April 1988 \hspace*{1.0in} CNA-216}
\vspace*{3.5in}
\noindent
This work was supported in part by the National Science Foundation under
Grant CCR-8518722, the Department of Energy under Grant DE-FG05-87ER25048,
and Cray Research, Inc. under Grant LTR DTD with The University of Texas
at Austin. Thomas C. Oppe's participation was supported in part by the
Control Data Corporation through its PACER Fellowship Program (1985-87)
and by Sandia National Laboratory through Contract No. 06-4298 (1987-88).
\newpage
\setcounter{page}{0}
\title{NSPCG User's Guide
\thanks{ This work was supported in part by the National
Science Foundation under Grant CCR-8518722, the Department
of Energy under Grant DE-FG05-87ER25048, and Cray Research,
Inc. under Grant LTR DTD with The University of Texas at
Austin. Thomas C. Oppe's participation was supported
in part by the Control Data Corporation through its PACER
Fellowship Program (1985-87) and by Sandia National Laboratory
through Contract No. 06-4298 (1987-88). } \\
\large{Version 1.0}}
\author{A Package for \\
Solving Large Sparse Linear Systems by \\
Various Iterative Methods \\ \\
Thomas C. Oppe \\ Wayne D. Joubert \\ David R. Kincaid \\ \\
Center for Numerical Analysis \\
The University of Texas at Austin}
\date{April 1988}
\maketitle
\begin{abstract}
NSPCG (for Nonsymmetric Preconditioned Conjugate Gradient) is
a computer package to solve the linear system $Au=b$ by various
iterative methods. The coefficient matrix $A$ is assumed to be large and
sparse with real coefficients. A wide selection of preconditioners
and accelerators is available for both symmetric and nonsymmetric
coefficient matrices. Several sparse matrix data structures are
available to represent matrices whose structures range from highly
structured to completely unstructured.
\end{abstract}
\newpage
\centerline{\bf Disclaimer}
\bigskip
The University of Texas at Austin and the Center for Numerical
Analysis (CNA) disclaim all warranties with regard to the NSPCG
software package, including all warranties of merchantability and
fitness, and any stated express warranties are in lieu of all
obligations or liability on the part of The University or the CNA
for damages, including, but not limited to, special, indirect, or
consequential damages arising out of or in connection with the
use or performance of NSPCG. In no event will The University or
the CNA be liable for any direct, indirect, special, incidental, or
consequential damages arising in connection with use of or
inability to use NSPCG or the documentation.
It should be emphasized that the NSPCG package is still in preliminary
and incomplete form. For example,
\begin{itemize}
\item The use of adaptive procedures for $\om$ in the SOR and SSOR
methods assumes a symmetric or nearly symmetric matrix. In
the nonsymmetric case, the user must supply a good choice of
$\om$ when using SSOR with any of the nonsymmetric accelerators,
and this value will not change during the iterations.
\item Eigenvalue estimation procedures are available for only some
of the nonsymmetric accelerators.
\item Not all accelerators allow right-oriented or split preconditioners.
\item Not all preconditioners are available for each of the allowed data
structures in NSPCG.
\end{itemize}
A limited number of copies of the NSPCG package will be made available
on request with the understanding that it is intended as a research tool
and will undergo further development. The interested reader should write
to the address below for additional information on obtaining the
distribution tape of the program. Also, reports of difficulties encountered
in using the system or comments and suggestions for improving the package
would be welcome.
\bigskip
\begin{tabular}{l}
Center for Numerical Analysis \\
RLM Bldg. 13.150 \\
University of Texas at Austin \\
Austin, Texas \ \ 78713-8510
\end{tabular}
\newpage
\tableofcontents
\newpage
\listoftables
\newpage
\section{Introduction}
\label{intro}
\indent
NSPCG is a computer package to solve large sparse systems of linear
equations by iterative methods. The package uses various acceleration
techniques, such as the conjugate gradient method, Chebyshev
acceleration and various generalized conjugate gradient methods for
nonsymmetric systems, in conjunction with various preconditioners
(or basic iterative methods). NSPCG was developed as part of the ITPACK
project of the Center for Numerical Analysis at The University of Texas
at Austin \cite{itpackv2c,itpack2c,future}.
\smallskip
Some of the salient features of the package are as follows:
\begin{itemize}
\item
Accelerators for the nonsymmetrizable case such as ORTHOMIN,
GCR (Generalized Conjugate Residual), Lanczos, and Paige
and Saunders' LSQR method are provided, as well as accelerators
for the symmetrizable case such as conjugate gradient and
Chebyshev acceleration.
\item
Basic preconditioners such as Jacobi, Incomplete LU
Decomposition, Modified Incomplete LU Decomposition,
Successive Overrelaxation, and Symmetric Successive
Overrelaxation are included in the package. Furthermore,
some of these preconditioners are available either as
left-, right-, or two-sided preconditioners. (See
Section~\ref{bbprecons} for additional details.)
\item
The package is modular. Any preconditioner may be used with
any accelerator, except for SOR which is unaccelerated. Any
preconditioner may be used with any of the allowable data
storage formats, except for block preconditioners which are
available only with the diagonal data structures.
\item
Several matrix storage schemes are available. The user may
represent the matrix in one of several sparse matrix formats:
primary storage (the same format used in the ELLPACK package
\cite{ELLPACK} from Purdue University),
symmetric diagonal storage, nonsymmetric diagonal storage,
symmetric coordinate storage, or nonsymmetric coordinate
storage.
\item
The package can be used in a matrix-free mode, in which the user
supplies customized routines for performing matrix operations.
\item
The data structures have been chosen for efficiency on vector,
or pipelined, computers. Many of the preconditioners have been
vectorized to work efficiently on such computers as the
Cyber 205, Cray 1, and Cray X-MP. It is expected that the
package would perform well on the Hitachi, Fujitsu and NEC
supercomputers.
\end{itemize}
One of the purposes for the development of the NSPCG package
was to investigate the suitability of various basic iterative
methods for vector computers. The degree of vectorization that
is possible for a particular iterative method often depends on the
underlying structure of the matrix, the data structure used for
the matrix representation, the ordering used for the equations,
and the vector computer being used. NSPCG allows several sparse
matrix data structures suitable for matrices with regular or
irregular structures. Also, various orderings can be given to
the equations to enhance vectorization of the basic iterative
method. Finally, the package has been written so that it can
be installed on supercomputers with different vectorizing
philosophies (e.g., memory-to-memory computations as with the
Cyber 205 computer or register-to-register computations as with
the Cray computers).
Another purpose for the development of the NSPCG package was
to provide a common modular structure for research on iterative
methods. In addition to providing a large number of preconditioners
and accelerators, the package has been constructed to facilitate the
addition of new preconditioners and new acceleration schemes
into the package. Thus, the package is an experimental research
tool for evaluating certain aspects of iterative methods and can
provide a guide for the construction of an iterative algorithm
which is tailored to a specific problem. The code is not intended
to be used in production situations, but it can provide information
on
\bigskip
\begin{itemize}
\item the convergence or nonconvergence of an iterative method
\item the number of iterations required for convergence
\item the existence of a preconditioner (e.g., incomplete
factorization preconditioners)
\item the suitability of an approximate stopping test in
reference to an idealized stopping test
\item the effectiveness of certain adaptive procedures for
selecting iterative parameters
\item some vectorization techniques for various iterative
kernels using certain data structures
\end{itemize}
\bigskip
This information can then be used in designing a production
iterative code.
\newpage
\section{Usage}
\label{usage}
\indent
The calling sequence for the package is
\bigskip
\noindent
{\tt CALL NSPCG ($\la$~{\em precon}~$\ra$ ,
$\la$~{\em accel}~$\ra$ ,NDIM,MDIM,N,MAXNZ,COEF,JCOEF,P,IP,U,UBAR,RHS, \\
\hspace*{2.5in} WKSP,IWKSP,NW,INW,IPARM,RPARM,IER) }
\bigskip
\noindent
where
\bigskip
\begin{list}{}{\labelwidth 0.70in
\leftmargin 1.00in \rightmargin 0.25in}
\item[$\la$~{\em precon}~$\ra$ \hfill]
is the name of a preconditioning routine which must
be declared EXTERNAL in the user's calling routine. It
must be one of a certain number of choices given in
Section~\ref{precons}. [name]
\item[$\la$~{\em accel}~$\ra$ \hfill]
is the name of an acceleration routine which must
be declared EXTERNAL in the user's calling routine. It
must be one of a certain number of choices given in
Section~\ref{accels}. [name]
\item[NDIM \hfill]
is the row dimension of the COEF array (See
Section~\ref{storage} on Storage Modes for more information).
[integer, input]
\item[MDIM \hfill]
is the column dimension of the COEF array (See
Section~\ref{storage} on Storage Modes for more information).
[integer, input]
\item[N \hfill]
is the order of the linear system. [integer, input]
\item[MAXNZ \hfill]
is the active column width of the COEF array (See
Section~\ref{storage} on Storage Modes for more information).
[integer, input/output]
\item[COEF \hfill]
is a real array containing the matrix nonzero coefficients
in one of several possible formats to be given in
Section~\ref{storage}. [real, input/output]
\item[JCOEF \hfill]
is an integer array containing auxiliary matrix nonzero
information corresponding to COEF. [integer, input/output]
\item[P \hfill]
is an integer vector. If no permutation is to be applied
to the matrix, then P is unused and may be dimensioned
to be of length one.
If the matrix is to be permuted into another
form, such as red-black, P is a coloring vector of length N
indicating the integer coloring number of each equation.
On output, P is then the permutation vector corresponding
to the multi-color ordering. (See Sections~\ref{rb} and
\ref{mc} for more details.) [integer, input/output]
\item[IP \hfill]
is an integer vector. If no permutation is to be applied
to the matrix, then IP is unused and may be dimensioned
to be of length one.
If the matrix is to be permuted, IP is an integer workspace
vector of length N on input and the inverse permutation
vector corresponding to P on output. (See Sections~\ref{rb}
and \ref{mc} for more details.) [integer, output]
\item[U \hfill]
is a real vector of length N containing the current
estimate of the solution to the linear system on
input. The user must supply an initial guess, which
can be all zeros if no information is known. On
output, U contains the solution estimate computed by
the package. [real, input/output]
\item[UBAR \hfill]
is a real vector of length N containing the exact solution
to the linear system, if known. This vector is used
only if the exact stopping test is used. Otherwise,
it may be dimensioned to be of length one. (See
Section~\ref{stop} for information on the available
stopping tests.) [real, input]
\item[RHS \hfill]
is a real vector of length N containing the right-hand side
of the matrix problem. [real, input]
\item[WKSP \hfill]
is a real vector of length NW used for workspace.
[real, input]
\item[IWKSP \hfill]
is an integer vector of length INW used for workspace.
[integer, input]
\item[NW \hfill]
is an integer variable indicating the length of the
WKSP vector on input and the amount used on output.
If insufficient real workspace is provided, NW is
set on output to the amount of workspace needed
at the point execution terminated. (See Section~\ref{nw}
for suggested initial values.) [integer, input/output]
\item[INW \hfill]
is an integer variable indicating the length of the
IWKSP vector on input and the amount used on output.
If insufficient integer workspace is provided, INW is
set on output to the amount of workspace needed
at the point execution terminated. (See Section~\ref{nw}
for suggested initial values.) [integer, input/output]
\item[IPARM \hfill]
is an integer vector of length $25$ giving various
integer parameters which are described in
Section~\ref{params}. [integer, input/output]
\item[RPARM \hfill]
is a real vector of length $16$ giving various real
parameters which are described in Section~\ref{params}.
[real, input/output]
\item[IER \hfill]
is an error flag. A nonzero value on output indicates
a fatal or warning error was detected during the
iteration. (See Section~\ref{ier} for a list of error
codes.) [integer, output]
\end{list}
\newpage
\section{Choices for Preconditioner}
\label{precons}
\indent
The naming convention used for $\la$~{\em precon}~$\ra$,
the routine specifying the preconditioner, indicates both the
preconditioner and the data storage format. $\la$~{\em precon}~$\ra$
must be declared EXTERNAL in the user's calling program and has
the form $\la$~{\em name}~$\ra i$ where
\bigskip
\begin{tabular}{rl}
$i = 1$ & for primary storage format \\
$ = 2$ & for symmetric diagonal storage \\
$ = 3$ & for nonsymmetric diagonal storage \\
$ = 4$ & for symmetric coordinate storage \\
$ = 5$ & for nonsymmetric coordinate storage \\
$ = 6$ & for permuted primary storage format \\
$ = 7$ & for permuted diagonal storage (symmetric
or nonsymmetric)
\end{tabular}
\bigskip
\noindent
(See Section~\ref{storage} for a detailed description of these
storage schemes.)
If $i = 1,2,$ or $3$, the parameters P and IP in the calling sequence
are not used and therefore can be dimensioned to be of length one.
If $i=6$ or $7$, P must contain a user-supplied coloring vector
indicating how the equations and unknowns are to be ordered prior
to permuting the system. Preconditioners with names ending with
$4$ or $5$ may be used with or without a permutation.
The available choices for
$\la$~{\em precon}~$\ra$ are:
\bigskip
\begin{tabular}{lll}
RICH$i$ & Richardson's method
& $(i=1,2,3,4,5)$ \\
JAC$i$ & Jacobi method
& $(i=1,2,3,4,5)$ \\
LJAC$i$ & Line Jacobi method
& $(i=2,3)$ \\
LJACX$i$ & Line Jacobi method (approx. inverse)
& $(i=2,3)$ \\
SOR$i$ & Successive Overrelaxation
& $(i=1,2,3,6,7)$ \\
SSOR$i$ & Symmetric SOR
& $(i=1,2,3,6,7)$ \\
IC$i$ & Incomplete Cholesky
& $(i=1,2,3,6)$ \\
& (Note: IC7 = BIC7 or BICX7)
& \\
MIC$i$ & Modified Incomplete Cholesky
& $(i=1,2,3,6)$ \\
& (Note: MIC7 = MBIC7 or MBICX7)
& \\
LSP$i$ & Least Squares Polynomial
& $(i=1,2,3,4,5)$ \\
NEU$i$ & Neumann Polynomial
& $(i=1,2,3,4,5)$ \\
LSOR$i$ & Line SOR
& $(i=2,3)$ \\
LSSOR$i$ & Line SSOR
& $(i=2,3)$ \\
LLSP$i$ & Line Least Squares Polynomial
& $(i=2,3)$ \\
LNEU$i$ & Line Neumann Polynomial
& $(i=2,3)$ \\
BIC$i$ & Block Incomplete Cholesky (ver. 1)
& $(i=2,3,7)$ \\
BICX$i$ & Block Incomplete Cholesky (ver. 2)
& $(i=2,3,7)$ \\
MBIC$i$ & Modified Block Incomplete Cholesky (ver. 1)
& $(i=2,3,7)$ \\
MBICX$i$ & Modified Block Incomplete Cholesky (ver. 2)
& $(i=2,3,7)$ \\
RS$i$ & Reduced System Method
& $(i=6,7)$
\end{tabular}
\bigskip
\noindent
(See Section~\ref{bbprecons} for a brief description of each of
these preconditioners.)
\newpage
\section{Choices for Accelerator}
\label{accels}
\indent
A large collection of accelerators is available to handle both
symmetric and nonsymmetric matrix problems. A list of the
available choices for $\la$~{\em accel}~$\ra$ is given below
with their corresponding common names given in parentheses.
Any $\la$~{\em precon}~$\ra$ entry can be used with any
$\la$~{\em accel}~$\ra$ entry except as noted. Also, any
accelerator allows left, right, or split orientation of the
preconditioner except as noted.
\bigskip
\begin{list}{}{\labelwidth 0.75in
\leftmargin 1.00in \rightmargin 0.25in}
\item[CG \hfill](Conjugate Gradient acceleration):
This is the two-term form of conjugate gradient for
symmetric and positive definite (SPD) matrices and mildly
nonsymmetric matrices. Only left preconditioning is
allowed.
\item[SI \hfill](Chebyshev acceleration or Semi-Iteration):
This is the two-term form of Chebyshev acceleration for
the same general class of matrices as CG. Only left
preconditioning is allowed.
\item[SOR \hfill](Successive Overrelaxation):
This routine must be used as $\la$~{\em accel}~$\ra$
for the SOR algorithm with adaptive parameter determination.
It is intended for SPD and mildly
nonsymmetric matrices. For this choice of
$\la$~{\em accel}~$\ra$, $\la$~{\em precon}~$\ra$
is restricted to SOR$i$ and LSOR$i$ for allowed values
of $i$. Also, these choices of $\la$~{\em precon}~$\ra$
cannot be used with other accelerations since SOR cannot
be accelerated. Note that Successive Overrelaxation is
not an acceleration but is included here since the
function of the SOR module is analogous to that of an
$\la$~{\em accel}~$\ra$ module.
\item[SRCG \hfill](SSOR CG Algorithm):
This algorithm performs the SSOR conjugate gradient
algorithm with an adaptive procedure for $\om$. It is
intended for SPD and mildly nonsymmetric matrices.
$\la$~{\em precon}~$\ra$ is restricted to SSOR$i$ and
LSSOR$i$ for allowed values of $i$. These selections
for $\la$~{\em precon}~$\ra$ can also be used with other
accelerators, but no adapting on $\om$ will be performed.
Only left preconditioning is allowed. Note that the SRCG
module combines both an accelerator and a preconditioner
but is included here since its function is analogous to
that of an $\la$~{\em accel}~$\ra$ module.
\item[SRSI \hfill](SSOR SI Algorithm):
This algorithm performs the SSOR semi-iteration
algorithm with an adaptive procedure for $\om$. It is
intended for SPD and mildly nonsymmetric matrices.
$\la$~{\em precon}~$\ra$ is restricted to SSOR$i$ and
LSSOR$i$ for allowed values of $i$. Only left
preconditioning is allowed. Note that the SRSI module
combines both an accelerator and a preconditioner but is
included here since its function is analogous to that of
an $\la$~{\em accel}~$\ra$ module.
\item[BASIC \hfill](Basic Iterative Method):
This algorithm runs the unaccelerated iterative method
corresponding to the given preconditioner. The default
extrapolation factor used in BASIC is $2$/(EMAX+EMIN) where
EMAX and EMIN are RPARM variables. To run the unaccelerated
method with no extrapolation, EMAX and EMIN should be
set to $1$. Only left preconditioning is allowed.
Note that the BASIC module is not an accelerator
but is included here since its function is analogous
to that of an $\la$~{\em accel}~$\ra$ module.
\item[ME \hfill](Minimal Error Algorithm):
This is an ORTHODIR-like algorithm for symmetric systems
with a three term recurrence which minimizes
$\|Q_R(u^{(n)}-A^{-1}b)\|_2$, the $2$-norm of the error,
at each iteration \cite{ME}.
\item[CGNR \hfill](Conjugate Gradient applied to the Normal Equations):
This implementation of CG applied to the normal equations
of the preconditioned system minimizes
$\|Q_L^{-1}(b-Au^{(n)})\|_2$, the $2$-norm of the residual
vector of the original preconditioned system, at each iteration
\cite{LSQR}. Only left preconditioning is allowed.
\item[LSQR \hfill](Least Squares Algorithm):
This algorithm calculates the same iterates as CGNR but in
a slightly more stable fashion \cite{LSQR}.
Only left preconditioning is allowed.
\item[ODIR \hfill](ORTHODIR):
This is a truncated/restarted method useful for nonsymmetric
systems of equations \cite{ORTHO}. The user can specify
the number of old vectors used in the truncation and
the restarting frequency.
\item[OMIN \hfill](ORTHOMIN):
This is a common truncated/restarted method used for
nonsymmetric systems \cite{ORTHO}. Note that this method
includes the popular GCR method (Generalized Conjugate
Residual or restarted ORTHOMIN) \cite{GCR}.
\item[ORES \hfill](ORTHORES):
This is another truncated/restarted method for nonsymmetric
systems \cite{ORTHO}.
\item[IOM \hfill](Incomplete Orthogonalization Method):
This is a truncated method due to Saad \cite{IOM1,IOM2}
which calculates the same iterates, in exact arithmetic, as
ORTHORES. In the symmetric case, it runs the SYMMLQ
algorithm of Paige and Saunders \cite{SYMMLQ}.
Only left preconditioning is allowed.
\item[GMRES \hfill](Generalized Minimal Residual Method):
This method is a truncated/restarted method which, in the
case of pure restarting (NS2 $\leq$ NS1 $+1$ where NS1
and NS2 are IPARM quantities), calculates
the same iterates as the truncated/restarted ORTHODIR algorithm
\cite{GMRES}. In the symmetric case, it may be used to run
the MINRES algorithm of Paige and Saunders \cite{SYMMLQ}.
\item[USYMLQ \hfill](Unsymmetric LQ):
This is a quasi-Krylov method useful for nonsymmetric
systems \cite{Yip}. Only left preconditioning is allowed.
\item[USYMQR \hfill](Unsymmetric QR):
This is a companion algorithm to
USYMLQ. It minimizes the $2$-norm of the residual over
an appropriate quasi-Krylov space \cite{Yip}.
Only left preconditioning is allowed.
\item[LANDIR \hfill](Lanczos/ORTHODIR):
This is the first of the three variants of the Lanczos algorithm
for nonsymmetric systems \cite{LANCZOS}.
Only left preconditioning is allowed.
\item[LANMIN \hfill](Lanczos/ORTHOMIN or Biconjugate Gradient Method):
This second variant of the Lanczos algorithm does slightly
less work per iteration than the other variants of the
Lanczos method and is often referred to as the Biconjugate
gradient method \cite{LANCZOS}.
\item[LANRES \hfill](Lanczos/ORTHORES or ``two-sided'' Lanczos Method):
This method is the third variant of the Lanczos algorithm
\cite{LANCZOS}. Only left preconditioning is allowed.
\item[CGCR \hfill](Constrained Generalized Conjugate Residual Method):
This method couples truncated/restarted ORTHOMIN with
a constrained residual technique described in
\cite{Wallis1,Wallis2}.
At each iteration, the average residual over each
two-dimensional block is forced to be zero. The
variable NBL2D must be appropriately set for this algorithm.
\item[BCGS \hfill](Biconjugate Gradient Squared Method):
This method is similar to the Biconjugate Gradient Method,
and in many cases performs better \cite{BCGS}. Only left
preconditioning is implemented.
\end{list}
\begin{table}
The permitted $\la$~{\em precon}~$\ra$ and $\la$~{\em accel}~$\ra$
combinations are given in the table below.
\bigskip
\begin{center}
\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|}
\hline
&R&J&L&L&S&S&I&M&L&N&L&L&L&L&B&B&M&M&R \\
&I&A&J&J&O&S&C&I&S&E&S&S&L&N&I&I&B&B&S \\
&C&C&A&A&R&O&$i$&C&P&U&O&S&S&E&C&C&I&I&$i$ \\
&H&$i$&C&C&$i$&R& &$i$&$i$&$i$&R&O&P&U&$i$&X&C&C& \\
&$i$& &$i$&X& &$i$& & & & &$i$&R&$i$&$i$& &$i$&$i$&X& \\
& & & &$i$& & & & & & & &$i$& & & & & &$i$& \\ \hline
CG &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
SI &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
SOR & & & & &x& & & & & &x& & & & & & & & \\
SRCG & & & & & &x& & & & & &x& & & & & & & \\
SRSI & & & & & &x& & & & & &x& & & & & & & \\
BASIC &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
ME &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
CGNR &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
LSQR &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
ODIR &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
OMIN &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
ORES &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
IOM &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
GMRES &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
USYMLQ &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
USYMQR &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
LANDIR &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
LANMIN &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
LANRES &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
CGCR &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\
BCGS &x&x&x&x& &x&x&x&x&x& &x&x&x&x&x&x&x&x \\ \hline
\end{tabular}
\caption{Permitted $\la$~{\em precon}~$\ra$ and
$\la$~{\em accel}~$\ra$ Combinations}
\end{center}
\end{table}
\newpage
\section{Brief Background on Preconditioners}
\label{bbprecons}
\indent
The choice of a preconditioner involves the selection of a
matrix $Q$, called the {\it splitting\/} matrix, such that the
preconditioned system
\[ Q^{-1}Au = Q^{-1}b \]
is better conditioned than the original system, $Au = b$. Clearly,
one requirement for $Q$ is that it be easily invertible (i.e., linear
systems having $Q$ as the coefficient matrix can be solved with much
less effort than solving the original system $Au=b$). To reduce the
number of iterations, it is also desirable that $Q$ be ``close'' to $A$
in the sense that the spectral radius of $I-Q^{-1}A$ be small. Thus,
in choosing a preconditioner, the user must select between methods
which usually perform a large number of ``cheap'' iterations or a
small number of ``expensive'' iterations. NSPCG provides a great
number of preconditioning methods to aid the user in selecting a
preconditioner for a production code.
Left preconditioning is illustrated above. Many nonsymmetric
accelerators in the package allow other orientations for the
preconditioner, such as right preconditioning,
\[ (AQ^{-1})(Qu) = b \]
or two-sided preconditioning,
\[ ( Q_L^{-1}AQ_R^{-1}) (Q_Ru) = Q_L^{-1}b \]
in the event that the preconditioner can be split into $Q = Q_LQ_R$.
The variable IQLR [= IPARM(22)] is set by the user to determine
the orientation of the preconditioner (left, right, or two-sided)
for those accelerators which allow orientations of the preconditioner
other than left preconditioning. Left preconditioning is the
default and is available for all accelerators.
NSPCG supplies preconditioners which can be classified as ``point''
or ``line'' preconditioners. Iterative methods are often used in
the solution of large sparse linear systems which arise from the
discretization of partial differential equations using finite
differences or finite elements. The solution to the partial
differential equation is approximated on a discrete set of unknowns
associated with a mesh defined on the domain of the equation. The terms
``point'' and ``line'' (or ``block'') refer to whether the unknowns are
updated one at a time, as with a single node in the mesh, or
several at a time, as with a line of nodes in the mesh.
Let matrix $A$ be written as
\[ \lp \begin{array}{cccc}
A_{11} & A_{12} & \cdots & A_{1n} \\
A_{21} & A_{22} & \cdots & A_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n1} & A_{n2} & \cdots & A_{nn}
\end{array} \rp \]
If matrix $A$ is viewed as a point matrix, then $n$ is the order of
the system and each $A_{ij}$ is a scalar. In that case, $A$ can be
written as a sum of point matrices
\[ A = D - C_L - C_U \]
where $D$ is a diagonal matrix, $C_L$ is a strictly lower triangular
matrix, and $C_U$ is a strictly upper triangular matrix. If matrix
$A$ is viewed as a block matrix corresponding to a partitioning $\pi$
of the unknowns, then $n$ is the number of groups in the partition
and each $A_{ij}$ is itself a matrix. In that case, $A$ can be
written as a sum of block matrices
\[ A = \pD - \pL - \pU \]
where $\pD$ is a block diagonal matrix, $\pL$ is a
strictly lower triangular block matrix, and $\pU$ is a
strictly upper triangular block matrix. In NSPCG, the following
restrictions apply to block matrices:
\begin{itemize}
\item If the matrix is not permuted by NSPCG, then block preconditioners
can only be used with symmetric or nonsymmetric diagonal
storage format of the matrix. In that case, each $A_{ii}$
is required to be a dense banded matrix, and all submatrices
must be of equal size. Also, the user must specify the
variable KBLSZ [= IPARM(19)]. KBLSZ is the order of the
1-D block diagonal submatrices $A_{ii}$ which must be
constant for all $i$.
\item If the matrix is permuted by NSPCG, then block preconditioners
can be used with any storage format for the matrix. In that case,
each $A_{ii}$ is required to be a dense banded matrix, and the
submatrices can be of unequal size. In the case of primary
storage format, each $A_{ii}$ is required to be a diagonal
matrix.
\end{itemize}
In the line versions of the preconditioners, a common operation is the
solution of banded subsystems corresponding to the $A_{ii}$ diagonal
block submatrices. NSPCG does not employ a cyclic reduction algorithm
to solve such systems, but will attempt to vectorize the solution if
such a system is actually composed of multiple independent subsystems
of the same size. In this case, the algorithm used is a scalar
forward and back substitution for each individual subsystem, but
with each operation being done on all the subsystems in a vectorizable
way.
\bigskip
\noindent
{\bf POINT PRECONDITIONERS}
\bigskip
\indent
The point preconditioners available in the package are the
following:
\bigskip
\begin{description}
\item[Richardson method (RF):]
For this method, $Q = I$, so, in effect, this represents
the null preconditioner.
\item[Jacobi method:]
For this method, $Q = D$, the diagonal of $A$. If the
matrix is known to have positive diagonal elements,
the Jacobi method may be more efficiently applied by
requesting diagonal scaling of the matrix and applying
the RF method to the system.
\item[Successive Overrelaxation method (SOR):]
For this method,
\[ Q = \oom D-C_L \]
The SOR iteration parameter $\om$ can be determined by an
automatic adaptive procedure. This method allows no
acceleration.
\item[Symmetric Successive Overrelaxation method (SSOR):]
For this method,
\[ Q = \frac{1}{2-\om}
\lp \oom D - C_L\rp \lp \oom D \rp^{-1}\lp \oom D - C_U \rp \]
The SSOR iteration parameter $\om$ can be determined by an
automatic adaptive procedure if SSOR is used with either
conjugate gradient or Chebyshev acceleration. Otherwise,
a user-supplied fixed $\om$ is required.
\item[Incomplete $LU$ Decomposition method (ILU(k)):]
This preconditioner uses an incomplete $LU$ decomposition of
the matrix as a preconditioner. The parameter $k$ denotes
the level of fill-in which is to be allowed in the
factorization. The form of $Q$ is:
\[
Q = \lc \begin{array}{ll}
(\Delta - C_L) \Delta^{-1} (\Delta - C_U) & \mbox{if Case I} \\
(\Delta - S) \Delta^{-1} (\Delta - T) & \mbox{if Case II}
\end{array} \right.
\]
Case I occurs when matrix $A$ has Property A (i.e., is 2-cyclic)
and $k=0$. Case II occurs if either condition fails.
Here, $\Delta$ is a diagonal matrix containing the factorization
pivots, $S$ is a strictly lower triangular matrix, and $T$ is
a strictly upper triangular matrix.
It can be seen that if Case I is true, then a considerable
savings in storage is possible since only a vector of pivots
of length N has to be stored. $Q$ can then be implicitly
represented from just $\Delta$ and the given matrix elements,
which are already stored. If Case II is true, then
it is necessary to store $T$ as well (and $S$, if $A$ is
nonsymmetric).
A fill-in level of $k=0$ indicates that no fill-in beyond
the original matrix pattern of nonzeros is to be allowed
in the factorization. For $k=1$, fill-in resulting from
the original nonzero pattern is allowed but no fill-in
resulting from this newly-created fill-in is allowed. In
general, fill-in at level $k$ results from fill-in from levels
$0,1,2, \ldots, k-1.$ As $k$ grows, the number of
iterations should decrease but at the expense of increased
storage and time per iteration.
\item[Modified Incomplete $LU$ Decomposition method (MILU(k)):]
This factorization is similar to the \newline ILU(k) preconditioner
except that the diagonal pivots of the factorization are
adjusted so that $Q-A$ has zero row sums. For many matrices,
this requirement produces a better condition number for
$Q^{-1}A$ than for the ILU(k) preconditioner. Also, this
requirement forces $Q^{-1}A$ to have at least one eigenvalue
equal to one. As in the previous preconditioner, a variable
level of fill-in is allowed.
\item[Neumann Polynomial method:]
For this method, a truncated Neumann series approximation
to $A^{-1}$ is used. The user can specify the degree of the
polynomial to be used. Assuming $A$ is written as $A=D-C$,
where $D$ is the diagonal of $A$ and $C=C_L+C_U$,
then $A=(I-CD^{-1})D$ so
\begin{eqnarray*}
A^{-1} & = & D^{-1}(I-CD^{-1})^{-1} \\
& = & D^{-1}[I+CD^{-1}+(CD^{-1})^2 + \cdots ]
\end{eqnarray*}
A truncated form of this series is then used for $Q^{-1}$.
$Q^{-1}$ is not explicitly computed; $Q^{-1}p$ is
evaluated for a vector $p$ by using a sequence of matrix-vector
multiplications. This method will be effective if the spectral
radius of $CD^{-1}$ is less than one.
\item[Least Squares Polynomial method:]
For this method, a least squares polynomial is used to
approximate the inverse of $A$:
\[ Q^{-1} = p_s(A) \approx A^{-1} \]
Since it is desired that the spectral radius of the iteration
matrix $G=I-Q^{-1}A$ be small, $p_s$ is selected so that the
function $f(x)=1-p_s(x)x$ has minimum $2$-norm over a domain
which contains the spectrum of $A$. Note that $G=f(A)$ is the
iteration matrix. This preconditioner is effective only if
$A$ is SPD (symmetric and positive definite) or nearly so,
in which case, the domain is $[0,\| A \| _\infty]$.
\item[Reduced System method (RS):]
This method first requires that the system $Au=b$
be permuted to a red-black system:
\[
\lp \begin{array}{cc}
D_R & H \\
K & D_B \end{array} \rp
\lp \begin{array}{c}
u_R \\
u_B \end{array} \rp =
\lp \begin{array}{c}
b_R \\
b_B \end{array} \rp
\]
where $D_R$ and $D_B$ are diagonal matrices. This will only
be possible if matrix $A$ has Property A \cite{Young2}. The
black unknowns can be eliminated from the system to yield the
reduced system:
\[
(D_R - H D_B^{-1} K) u_R = b_R - H D_B^{-1} b_B
\]
which becomes the new matrix problem to be solved. Once
$u_R$ has converged to an answer, $u_B$ is found by
\[
u_B = D_B^{-1} ( b_B - K u_R )
\]
The reduced system preconditioner is $D_R$. Note that the
reduced system is not explicitly computed with this
method. However, NSPCG contains a facility for explicitly
computing the reduced system and applying any of the
package preconditioners to it. See Section~\ref{rb}
for more details.
\end{description}
\newpage
\noindent
{\bf LINE PRECONDITIONERS}
\bigskip
\indent
The line preconditioners available in the package are the
following:
\bigskip
\begin{description}
\item[Line Jacobi method:]
For this method, $Q=\pD$, the block diagonal part
of $A$. For many matrices resulting from finite difference
discretizations of partial differential equations,
$\pD$ is a tridiagonal matrix. The solution to the
preconditioning equation $\pD z = r$
is accomplished by a recursive forward and back solve.
If, however, $\pD$ consists of multiple independent
subsystems of size KBLSZ, NSPCG will perform each step
of the factorization and solution process across all
the subsystems in an independent manner. This method
will vectorize on computers allowing constant stride
vector operations.
\item[Line Jacobi (approximate inverse) method:]
This method uses a banded approximation to the inverse of
$\pD$. The solution of $\pD z=r$ is accomplished by
\[
z = \lb \lp \pD\rp^{-1}\rb r
\]
where the $[\cdot ]$ operator indicates a truncation of the
matrix to a banded system. The variable LTRUNC [= IPARM(17)]
is used to determine the half-bandwidth to be used
in the truncation. If $\pD$ is diagonally dominant, then
the diagonals of $\lp \pD \rp ^{-1}$ decay at an exponential
rate the further the diagonal is away from the main diagonal.
Hence for diagonally dominant $\pD$, a banded approximation
to the inverse of $\pD$ will suffice. Thus, the
preconditioning step has been changed from a solve to a
matrix-vector multiply, a vectorizable operation.
\item[Line SOR (LSOR):]
For this method,
\[ Q=\oom \pD-\pL \]
\item[Line SSOR (LSSOR):]
For this method,
\[ Q = \frac{1}{2-\om}
\lp \oom \pD - \pL \rp \lp \oom \pD \rp ^{-1}
\lp \oom \pD - \pU \rp \]
\item[Incomplete Block $LU$ Decomposition, Version 1:]
For this method, $Q$ takes the form:
\[ Q = \lc \begin{array}{ll}
\lp M-\pL \rp M^{-1} \lp M-\pU \rp & \mbox{if Case I} \\
\lp M-S \rp M^{-1} \lp M-T \rp & \mbox{if Case II}
\end{array} \right. \]
Case I occurs when matrix $A$ has block Property A and no
fill-in of blocks is allowed. Case II occurs otherwise.
$M$ here is a block diagonal matrix. Truncated inverses of
diagonal pivot block matrices are used in the construction
of the factorization. We illustrate the factorization
process in the case that $A$ is block tridiagonal:
\[
A = \lp \begin{array}{ccccc}
D_1 & U_1 & & & \\
L_1 & D_2 & U_2 & & \\
& L_2 & \ddots & \ddots & \\
& & \ddots & D_{n-1} & U_{n-1} \\
& & & L_{n-1} & D_n \end{array} \rp
\]
Then $A$ has block Property A, so Case I is used. Thus we seek
a factorization of the form \newline
\mbox{$\lp I-\pL M^{-1} \rp\lp M-\pU \rp$} or
\[ \lp \begin{array}{ccccc}
I & & & & \\
L_1M_1^{-1} & I & & & \\
& L_2M_2^{-1} & I & & \\
& & \ddots & \ddots & \\
& & & L_{n-1}M_{n-1}^{-1}& I
\end{array} \rp \lp \begin{array}{ccccc}
M_1 & U_1 & & & \\
& M_2 & U_2 & & \\
& & M_3 & \ddots & \\
& & & \ddots & U_{n-1} \\
& & & & M_n
\end{array} \rp \]
Thus an exact block factorization satisfying $A=\lp I-\pL M^{-1}\rp
\lp M-\pU\rp$ results in the following recursion formula for
$M_i$:
\[ M_i = D_i - L_{i-1} M_{i-1}^{-1} U_{i-1}
\hspace*{1.0in} (1 \leq i \leq n) \]
Since $M_i^{-1}$ is a full block matrix, truncation to a
banded form is used, resulting in an incomplete block
factorization. Thus,
\[ M_i = D_i - [ L_{i-1} [M_{i-1}^{-1}] U_{i-1} ]
\hspace*{1.0in} (1 \leq i \leq n) \]
where $[\cdot ]$ is a truncation operator to indicate truncation
to a banded form. LTRUNC [= IPARM(17)] is used to determine the
half-bandwidth to be used in the truncations.
\item[Incomplete Block $LU$ Decomposition, Version 2:]
For this method, $Q$ takes the form:
\[
Q = \lc \begin{array}{ll}
\lp M^{-1}-\pL \rp M \lp M^{-1}-\pU \rp & \mbox{if Case I} \\
\lp M^{-1}-S \rp M \lp M^{-1}-T \rp & \mbox{if Case II}
\end{array} \right.
\]
$M$ is a block diagonal matrix and Cases I and II are defined
as above. Truncated inverses of diagonal pivot block matrices
are used in the construction of the factorization. We illustrate
the factorization for a block tridiagonal matrix,
as given above. Thus, Case I applies and a factorization of
the form $\lp I-\pL M \rp\lp M^{-1}-\pU \rp$ is used.
\[ \lp \begin{array}{ccccc}
I & & & & \\
L_1M_1 & I & & & \\
& L_2M_2 & I & & \\
& & \ddots & \ddots & \\
& & & L_{n-1}M_{n-1} & I
\end{array} \rp \lp \begin{array}{ccccc}
M_1^{-1} & U_1 & & & \\
& M_2^{-1} & U_2 & & \\
& & M_3^{-1} & \ddots & \\
& & & \ddots & U_{n-1} \\
& & & & M_n^{-1}
\end{array} \rp \]
Thus an exact block factorization satisfying $A=\lp I-\pL M \rp
\lp M^{-1}-\pU \rp$ results in the following recursion formula for
$M_i$:
\[ M_i = \lp D_i - L_{i-1} M_{i-1} U_{i-1} \rp^{-1}
\hspace*{1.0in} (1 \leq i \leq n) \]
Since $M_i$ is a full block matrix, truncation to a
banded form is used, resulting in an incomplete block
factorization. Thus,
\[ M_i = [(D_i - [ L_{i-1} M_{i-1} U_{i-1} ])^{-1} ]
\hspace*{1.0in} (1 \leq i \leq n) \]
where $[\cdot]$ is the truncation operator.
\item[Modified Incomplete Block $LU$ Decomposition, Version 1:]
For this method, the pivot blocks are modified during the
factorization so that $Q-A$ has zero row sums. The general
form for $Q$ is the same as for the unmodified version.
To force $(Q-A)e=0$ where $e^T=(1,1,\ldots,1)$ (i.e., $Q-A$ has
zero row sums), we set
\[ Q=\lp M-\pL \rp M^{-1}\lp M-\pU \rp \]
where
\[ M_i=D_i-[L_{i-1}[M_{i-1}^{-1}]U_{i-1}]+\Lambda_i
\hspace*{1.0in} (1 \leq i \leq n) \]
and
\[ \Lambda_i = \mbox{diag} \lc [L_{i-1}[M_{i-1}^{-1}]U_{i-1}]e
- L_{i-1}M_{i-1}^{-1}U_{i-1}e \rc \]
\item[Modified Incomplete Block $LU$ Decomposition, Version 2:]
For this method, the pivot blocks are modified during the
factorization so that $Q-A$ has zero row sums. The general
form for $Q$ is the same as for the unmodified version 2.
To force $(Q-A)e = 0$, we set
\[ Q=\lp M^{-1}-\pL \rp M \lp M^{-1}-\pU \rp \]
where
\[ M_i = \tilde{M}_i + N_i \]
\[ \tilde{M}_i = [( D_i-[L_{i-1} M_{i-1} U_{i-1}] )^{-1} ]
\hspace*{1.0in} (1 \leq i \leq n) \]
and $N_{i}$ is a diagonal matrix determined by the equation
\[ N_i c_i = e - \tilde{M}_i c_i \]
where
\[ c_i = D_i e - L_{i-1} M_{i-1} U_{i-1} e \]
\item[Line Neumann Polynomial method:]
This is a line polynomial preconditioner which attempts to
approximate $A^{-1}$ by a polynomial in $\pC\lp\pD\rp^{-1}$
where $\pC=\pL+\pU$ and $A=\pD-\pC$. Then
\[ A = \lb I - \pC (\pD)^{-1}\rb \pD \]
so
\begin{eqnarray*}
A^{-1} & = & (\pD)^{-1}
\lc I - \pC(\pD)^{-1}\rc^{-1} \\
& = & (\pD)^{-1} \lc I + \pC(\pD)^{-1} +
\lb \pC(\pD)^{-1}\rb^2 + \cdots \rc
\end{eqnarray*}
A truncation of this series is then used as the Neumann
polynomial approximation for the inverse of $A$.
\item[Line Least Squares Polynomial method:]
For this method,
\[ Q^{-1} = p_s \lb(\pD)^{-1}A \rb \approx A^{-1} \]
for some polynomial of degree $s$. Since it is desired that
the spectral radius of the iteration matrix $G=I-Q^{-1}A$
be small, $p_s$ is selected so that the function $f(x)=1-p_s(x)x$
has minimum $2$-norm over a domain which contains the
spectrum of $(\pD)^{-1}A$.
\item[Line Reduced System method:]
This method requires that matrix $A$ have line Property A
and be given a line red-black ordering. Otherwise,
the details are the same as for point reduced system methods.
Thus, the system $Au=b$ is permuted to
\[ \lp \begin{array}{cc}
\pD_R & H^{(\pi)} \\
K^{(\pi)} & \pD_B \end{array} \rp
\lp \begin{array}{c}
u^{(\pi)}_R \\
u^{(\pi)}_B \end{array} \rp =
\lp \begin{array}{c}
b^{(\pi)}_R \\
b^{(\pi)}_B \end{array} \rp \]
where $\pD_R$ and $\pD_B$ are block diagonal matrices.
\end{description}
\newpage
\section{Brief Background on Accelerators}
\label{bbaccels}
\indent
Each accelerator in the package applies itself to the
preconditioned system
\[ (Q_L^{-1}AQ_R^{-1})(Q_R u) = (Q_L^{-1}b) \]
which we will represent abstractly as
\[ \sA \su = \sb \]
The package contains two classes of accelerators. The first class,
comprising the accelerators CG, SI, SRCG, SRSI and SOR, is best suited
for the symmetrizable case, that is, when the matrices $A$ and
$Q\equiv Q_LQ_R$ are symmetric positive definite (SPD). These symmetric
methods allow only left preconditioning, and they are designed to take
full advantage of the symmetry of $A$ and $Q$.
The second class comprises those accelerators designed to work
with more general problems than the symmetrizable case. These
accelerators in many cases allow for right and two-sided as well
as left preconditioning. They are best understood and will be
discussed here in terms of the system involving the abstract matrix
$\sA$.
Before giving practical usage considerations for the accelerators,
we will give a brief description of the solution technique of each
method. Each acceleration procedure is basically defined by two
criteria:
\begin{enumerate}
\item The solution is to be in some space, typically a shifted
Krylov space, and
\item The solution is selected from this space by some condition,
typically an orthogonality condition.
\end{enumerate}
Here the Krylov space is given by
\[ K_n(v,A)={\rm span}\{ A^i v\} _{i=0}^{n-1}. \]
For certain conditions on $A$, $Q_L$ and $Q_R$, the orthogonality
condition is equivalent to a minimization property over the space.
That is, $u^{(n)}$ is chosen to minimize the quantity
\[ \la u^{(n)}-\bar u,ZQ^{-1}A(u^{(n)}-\bar u)\ra \]
where $\bar u$ denotes the true solution to the original problem $Au=b$,
and $\la\cdot ,\cdot \ra$ denotes the standard inner product. The
matrix $Z$ depends on the method.
\smallskip
The following table gives a summary of the two criteria for
each accelerator. For simplicity, the effects of truncation and
restarting are neglected.
\medskip
\begin{center}
\begin{tabular}{|c|c|c|c|} \hline
Accelerator & Space & Orthogonality Condition & $Z$ \\ \hline
CG, SI & $u^{(n)}\in u^{(0)}+K_n(Q^{-1}r^{(0)},Q^{-1}A)$
& $r^{(n)}\bot K_n(Q^{-1}r^{(0)},Q^{-1}A)$
& $Q$ \\ \hline
ME & $\su^{(n)}\in \su^{(0)}+K_n(\sA\sr^{(0)},\sA)$
& $\sr^{(n)}\bot K_n(\sr^{(0)},\sA)$ ($\sA$ symm.)
& $Q_R^TQ_R A^{-1}Q$ \\ \hline
CGNR,LSQR & $\su^{(n)}\in \su^{(0)}+K_n(\sA^T\sr^{(0)}, \sA^T\sA)$
& $\sA^T\sr^{(n)}\bot K_n(\sA^T\sr^{(0)}, \sA^T\sA)$
& $A^TQ_L^{-T}Q_R$ \\ \hline
ORES,IOM & $\su^{(n)}\in \su^{(0)}+K_n(\sr^{(0)},\sA)$
& $\sr^{(n)}\bot K_n(\sr^{(0)},\sA)$
& $Q_R^TQ_R$ \\ \hline
ODIR,OMIN,GMRES
& $\su^{(n)}\in \su^{(0)}+K_n(\sr^{(0)},\sA)$
& $\sr^{(n)}\bot \sA K_n(\sr^{(0)},\sA)$
& $A^TQ_L^{-T}Q_R$ \\ \hline
USYMLQ & $\su^{(n)}\in \su^{(0)}+\widetilde K_n(\sr^{(0)},\sA^T)$
& $\sr^{(n)}\bot \widetilde K_n(\sr^{(0)},\sA)$
& $Q_R^TQ_R$ \\ \hline
USYMQR & $\su^{(n)}\in \su^{(0)}+\widetilde K_n(\sr^{(0)},\sA^T)$
& $\sr^{(n)}\bot \sA\widetilde K_n(\sr^{(0)},\sA^T)$
& $A^TQ_L^{-T}Q_R$ \\ \hline
LANDIR/MIN/RES
& $\su^{(n)}\in \su^{(0)}+K_n(\sr^{(0)},\sA)$
& $\sr^{(n)}\bot K_n(\sr^{(0)},\sA^T)$
& $Q_R^TQ_R$ \\ \hline
\end{tabular}
\end{center}
\bigskip
\noindent Here the $n$-dimensional quasi-Krylov space is defined by
\[ \widetilde K_n(v,A) = {\rm span} \{ v,Av,AA^Tv,AA^TAv,\ldots \} \]
The residual of the abstract system is given by
$\sr^{(n)}=\sb-\sA\su^{(n)}$, whereas $r^{(n)}=b-Au^{(n)}$ is the residual
of the original system.
A few comments are in order regarding the nonsymmetric accelerators.
ORTHODIR and ORTHOMIN always have a minimization property, for any $A$,
$Q_L$ and $Q_R$. ORTHORES, however, may not. In the case of $\sA$
symmetric, ORTHODIR and ORTHOMIN reduce to the conjugate residual method
for $\sA$, and ORTHORES reduces to the 3-term conjugate gradient
method applied to $\sA$.
The Lanczos methods all utilize the same ``auxiliary vector''
$\tilde\sr^{(0)} = \sr^{(0)}$. Thus when $\sA$ is
SPD they all give the same iterates as the conjugate gradient method
applied to $\sA$.
Now we present guidelines for determining which accelerator to use in
practical situations. If both $A$ and $Q$ are SPD, the
symmetric accelerators listed above may be used. Otherwise we proceed
as follows.
We consider four classes of matrix problem arising from the preconditioned
matrix $\sA$:
\begin{enumerate}
\item $\sA$ is SPD.
\item $\sA$ is symmetric indefinite.
\item $\sA$ is definite (i.e., the symmetric part, $(\sA+\sA^T)/2$,
or its negative, is SPD).
\item The general case.
\end{enumerate}
Unfortunately, when $\sA$ is not symmetric (cases 3. and 4. above)
the choice of best accelerator is not at all clear. However, we will give
some general guidelines below.
\bigskip
\noindent
{\bf The SPD Case:}
\bigskip
This case often arises from $A$ and $Q$ being SPD, in which case
the symmetric accelerators such as CG may be used. Otherwise, one may
use IOM (minimizing the $\sA^{-1/2}$-norm of $\sr^{(n)}$) or ORTHOMIN
or GMRES (minimizing the $2$-norm of $\sr^{(n)}$). These should be used
in their truncated forms, with variable NS2 \mbox{[= IPARM(10)]} set to
ITMAX [= IPARM(2)], and NS1 [= IPARM(9)] set to 1 (ORTHOMIN) or 2
(IOM, GMRES).
\bigskip
\noindent
{\bf The Symmetric Indefinite Case:}
\bigskip
In the symmetric indefinite case, the methods SYMMLQ and MINRES
may be used. SYMMLQ may be run by using truncated IOM with NS1$=2$
and NS2=ITMAX. MINRES may be run by using truncated
GMRES with NS1 and NS2 set the same way. Of the two, the MINRES
algorithm is to be preferred, as it minimizes the $2$-norm of the residual
$\sr^{(n)}$ at each iteration.
\bigskip
\noindent
{\bf The Definite Case:}
\bigskip
If it is known that $\sA$ is definite, ORTHOMIN and GMRES
are guaranteed to converge. These methods minimize the $2$-norm of
the residual $\sr^{(n)}$. The implementation of ORTHOMIN in the
package runs the algorithm ORTHOMIN(NS1), the truncated method,
and restarts it every NS2 iterations. The best settings for NS1 and
NS2 are as follows:
\begin{description}
\item[{\sl The Mildly Nonsymmetric Case.}]
For a mildly nonsymmetric matrix, NS2 should be set to ITMAX, so that
the pure truncated method is run. In this case, NS1$=0$ is the
steepest descent method, NS1$=1$ is the classical $2$-term conjugate
residual method, and progressively larger choices of NS1 handle
nonsymmetry better, but at the expense of greater computation time
and storage. A value of NS1 $\geq 1$ is preferred here.
\item[{\sl The Highly Nonsymmetric Case.}]
If the matrix is highly nonsymmetric, NS1 should be set to ITMAX,
so that the pure restarted method is run. The variable NS2 denotes
the restart frequency, and, again, larger values of NS2 perform better
but require more time and workspace. If a purely restarted method is
used (i.e., NS1 $\geq$ NS2), the GMRES accelerator should be preferred
to ORTHOMIN, as it requires slightly less storage and CPU time and is
slightly more stable numerically.
\end{description}
\bigskip
\noindent
{\bf The General Case:}
\bigskip
For the general nonsymmetric case, the ORTHOMIN and GMRES accelerators
may work but have the potential of breaking down. The package contains
other methods which are sometimes better. These include methods
based on the normal equations, Lanczos methods, and the
Biconjugate Gradient Squared method. Roughly speaking, if the methods
were listed in terms of increasing reliability (and decreasing overall
efficiency), they would be listed in the order: Lanczos-type methods
(including Biconjugate Gradient Squared), ORTHOMIN-type methods,
and normal equations methods.
\begin{description}
\item[{\sl Normal Equations:}]
For the case of a general matrix, methods based on the normal equations
corresponding to $\sA$ (i.e., solving $\sA^T\sA \su=\sA^T\sb$) may
be used. These methods are guaranteed to converge in exact arithmetic
for any nonsingular system whatsoever; however, in practice convergence
is usually slow, and roundoff error may preclude convergence altogether.
The package contains two implementations of conjugate gradients applied
to the normal equations, CGNR and LSQR, the latter of which is more
stable numerically. These both minimize the norm of the residual
$\sr^{(n)}$ at each step.
\item[{\sl Lanczos Methods.}]
Another useful accelerator is the Lanczos (Biconjugate
Gradient) method. This accelerator, making use of the transpose
operator, has a short recurrence, and will converge within N
iterations if the iteration process does not break down. However, the
iterates are not bounded over all choices of initial residual,
and in fact the method may break down
for nonsymmetric matrices. In fact, little is known about the
convergence properties of the Lanczos method. In spite of this,
for some classes of problems the Lanczos method works quite
efficiently. There are three Lanczos variants in the package,
of which the LANMIN variant is to be preferred.
\item[{\sl The Biconjugate Gradient Squared Method.}]
In many cases, the Biconjugate Gradient Squared (BCGS) \newline method
may converge faster than LANMIN. The BCGS method requires two
matrix multiplies per step and does not require the transpose. In
exact arithmetic, it breaks down for roughly the same matrix problems
as LANMIN does. Its faster convergence arises from the fact that it
generates the square of the error-reducing polynomial utilized by
LANMIN, thus in some sense converging twice as fast as LANMIN.
\end{description}
\newpage
\section{Parameter Arrays IPARM and RPARM}
\label{params}
\indent
The user must supply default values for the parameters in IPARM
and RPARM by inserting the line
\begin{verbatim}
CALL DFAULT (IPARM,RPARM)
\end{verbatim}
in the program before the call to NSPCG. The user may then assign
nondefault values to selected quantities in IPARM and RPARM by
inserting the appropriate assignment statements before the call to
the iterative routine.
Important variables in this package which may change adaptively
are EMAX and EMIN (eigenvalue estimates of $Q^{-1}A$), OMEGA
(overrelaxation
parameter for the SOR and SSOR methods), ALPHAB and BETAB (SSOR
parameters), and SPECR (estimate of the spectral radius of the SOR
iteration matrix).
The integer vector IPARM and the real vector RPARM allow the user
to control certain parameters which affect the performance of the
iterative algorithms. Furthermore, these vectors allow the updated
parameters from the automatic adaptive procedures to be communicated
back to the user. The IPARM and RPARM parameters are described
below.
\bigskip
\noindent
{\bf Description of IPARM parameters}
\bigskip
\begin{list}{}{\labelwidth 0.9in
\leftmargin 1.00in \rightmargin 0.25in}
\item[IPARM(1) \hfill](NTEST):
The stopping test number, in the range $1$ to $10$, indicates
which stopping test should be used to terminate the
iteration. (See Section~\ref{stop} for a description
of the available stopping tests.) The SOR accelerator
uses a specialized stopping test, so this parameter is
ignored when SOR is called unless $\mbox{NTEST}=6$, in which
case the exact stopping test is used. \mbox{[Default: $2$]}
\item[IPARM(2) \hfill](ITMAX):
On input, ITMAX is the maximum number of iterations
allowed. On output, ITMAX is the number of iterations
performed. \mbox{[Default: $100$]}
\item[IPARM(3) \hfill](LEVEL):
LEVEL is the output level control switch. Each higher
value provides additional information. \mbox{[Default: $0$]}
\begin{tabular}{ll}
$< 0$ & no output \\
$= 0$ & fatal error messages only (default) \\
$= 1$ & warning messages and minimum output \\
$= 2$ & reasonable summary \\
$= 3$ & parameter values and informative comments \\
$= 4$ & approximate solution after every iteration \\
\end{tabular}
\item[IPARM(4) \hfill](NOUT):
NOUT is the Fortran unit number for output. \mbox{[Default: $6$]}
\item[IPARM(5) \hfill](IDGTS):
IDGTS is the error analysis switch. An analysis of the
final computed solution is made to determine the
accuracy. \mbox{[Default: $0$]}
\begin{tabular}{ll}
$< 0$ & skip error analysis \\
$= 0$ & compute DIGIT1 and DIGIT2 and store in RPARM
(default) \\
$= 1$ & print DIGIT1 and DIGIT2 \\
$= 2$ & print final approximate solution vector \\
$= 3$ & print final approximate residual vector \\
$= 4$ & print both solution and residual vectors
\end{tabular}
If LEVEL is less than $1$, no printing is done. See
explanation of DIGIT1 [= RPARM(7)] and DIGIT2 [= RPARM(8)]
for more details.
\item[IPARM(6) \hfill](MAXADP):
This parameter is the adaptive procedure switch for EMAX.
\mbox{[Default: $1$]}
\begin{tabular}{ll}
$= 0$ & no adapting on EMAX \\
$= 1$ & adapting on EMAX (default)
\end{tabular}
\item[IPARM(7) \hfill](MINADP):
This parameter is the adaptive procedure switch for EMIN.
\mbox{[Default: $1$]}
\begin{tabular}{ll}
$= 0$ & no adapting on EMIN \\
$= 1$ & adapting on EMIN (default)
\end{tabular}
\item[IPARM(8) \hfill](IOMGAD):
This parameter is the adaptive procedure switch for OMEGA.
\mbox{[Default: $1$]}
\begin{tabular}{ll}
$= 0$ & no adapting on OMEGA \\
$= 1$ & adapting on OMEGA (default)
\end{tabular}
\item[IPARM(9) \hfill](NS1):
NS1 is the number of old vectors to be saved for the
truncated acceleration methods. \mbox{[Default: $5$]}
\item[IPARM(10) \hfill](NS2):
NS2 is the frequency of restarting for the restarted
acceleration methods. By default, NS2 is set to a large
value so that restarting is not done. \mbox{[Default: $100000$]}
\item[IPARM(11) \hfill](NS3):
Used only in ORTHOMIN, NS3 denotes the size of the largest
Hessenberg matrix to be used to estimate the eigenvalues;
it should be set to some value such as $40$. \mbox{[Default: $0$]}
\item[IPARM(12) \hfill](NSTORE):
NSTORE indicates the storage mode used. See
Section~\ref{storage} for a description of the storage
modes. \mbox{[Default: $2$]}
\begin{tabular}{ll}
$= 1$ & Primary format \\
$= 2$ & Symmetric diagonal format (default) \\
$= 3$ & Nonsymmetric diagonal format \\
$= 4$ & Symmetric coordinate format \\
$= 5$ & Nonsymmetric coordinate format
\end{tabular}
\item[IPARM(13) \hfill](ISCALE):
ISCALE is a switch indicating whether or not the matrix
should be scaled before iterating and unscaled after
iterating. \mbox{[Default: $0$]}
\begin{tabular}{ll}
$= 0$ & no scaling (default) \\
$= 1$ & scaling
\end{tabular}
If scaling is selected, the matrix is scaled to have a unit
diagonal, and $u$ and $b$ are scaled accordingly. If
NTEST $= 6$, $\bar{u}$ is also scaled.
If $Au=b$ is the system to be scaled, and $D$ is the diagonal
of $A$, the scaled system is
\[
(D^{-\frac{1}{2}}AD^{-\frac{1}{2}})
(D^{\frac{1}{2}}u)=D^{-\frac{1}{2}}b
\]
The diagonal elements of the matrix must be positive if
scaling is requested. Scaling of the system causes an
extra N elements of WKSP to be used, so scaling is not
recommended if storage is a consideration.
\item[IPARM(14) \hfill](IPERM):
IPERM is a switch indicating whether or not the matrix
should be permuted before iterating and unpermuted after
iterating. \mbox{[Default: $0$]}
\begin{tabular}{ll}
$= 0$ & no permuting (default) \\
$= 1$ & permuting
\end{tabular}
If a permutation of the matrix is desired, IPERM must be set
to $1$, and the vector P of the argument list of NSPCG must
contain a coloring vector. See Sections~\ref{rb} and
\ref{mc} for more details on
coloring vectors. If IPERM $= 0$, the vector P is ignored and
can be dimensioned to be of length one. If IPERM $= 1$, a
permutation vector is generated from P and replaces it, while
IP contains the inverse permutation vector on output.
If $Au=b$ is the system to be permuted and P is the
permutation vector, the permuted system is
\[
(PAP^T) (Pu) = Pb
\]
If NTEST $= 6$, $\bar{u}$ is permuted in addition to
$u$ and $b$.
\item[IPARM(15) \hfill](IFACT):
IFACT indicates whether a factorization associated with a
particular preconditioner should be computed for the
current NSPCG call or whether a factorization from a
previous NSPCG call should be used. For some preconditioners,
the factorization time can be a significant percentage of
the iteration time. If one is making a series of calls to
NSPCG (as in a time-dependent or nonlinear application) and
the coefficient matrix is not changing much, it may be
reasonable to amortize the cost of factorization over several
NSPCG calls by using a factor from a previous call for the
current call. The user must call the same preconditioner
for all calls, and the structure of $A$ as indicated by JCOEF
cannot be changing. \mbox{[Default: $1$]}
\begin{tabular}{ll}
$1$ & matrix $A$ is to be factored for the current call
(default) \\
$0$ & matrix $A$ is not to be factored for the current
call (previous factorization used)
\end{tabular}
\item[IPARM(16) \hfill](LVFILL):
LVFILL is the level of point or block fill-in to be allowed
during the factorization. It affects the preconditioners
\begin{tabular}{ll}
IC$i$ & $(i = 1,2,3)$ \\
MIC$i$ & $(i = 1,2,3)$ \\
BIC$i$ & $(i = 2,3)$ \\
BICX$i$ & $(i = 2,3)$ \\
MBIC$i$ & $(i = 2,3)$ \\
MBICX$i$ & $(i = 2,3)$
\end{tabular}
If LVFILL $= 0$, no fill-in
is allowed beyond the original matrix nonzero pattern. If
LVFILL $= 1$, fill-in is allowed caused by the original
nonzero pattern, but no further fill-in caused by these just
filled-in elements is allowed. If LVFILL $= 2$, fill-in is
allowed if it is due to the original pattern or LVFILL $= 1$
filled-in elements.
As an example of point fill-in, suppose a symmetric
matrix has diagonals at distances of $0$, $1$, and $s$.
Then the diagonals of the factorization are:
\[
\begin{array}{cl}
\mbox{LVFILL} & \mbox{Diagonals in factorization} \\
0 & 0,1,s \\
1 & 0,1,s-1,s \\
2 & 0,1,s-2,s-1,s \\
3 & 0,1,2,s-3,s-2,s-1,s \\
4 & 0,1,2,3,s-5,s-4,s-3,s-2,s-1,s
\end{array}
\]
In general, if at LVFILL $= k$, the positive diagonal numbers
are
\[
p_1,p_2, \ldots ,p_m
\]
and the negative diagonal numbers are
\[
q_1,q_2, \ldots ,q_n
\]
then the diagonal numbers at LVFILL $= k+1$ are
\[
p_i + q_j \hspace*{0.5in} (i=1,2, \ldots ,m
\mbox{\hspace*{0.1in} and \hspace*{0.1in}}
j=1,2, \ldots ,n)
\]
In the example above for LVFILL $=0$, then $p_1=1$,
$p_2=s$, $q_1=-1$, and $q_2=-s$. Hence for LVFILL $=1$,
the diagonal numbers are $0,1,-1,(s-1),-(s-1),s,-s$.
For block factorization methods, LVFILL is the level of
block fill-in. For example, if a 7-point finite difference
stencil is used to discretize a partial differential equation
on a 3-dimensional box domain, the resulting matrix can be
regarded as a block pentadiagonal matrix with 1-D blocks
corresponding to the mesh lines. The block band is sparse
and will tend to fill in during a block factorization. If
LVFILL is positive, line blocks which are zero in the original
matrix will be allowed to fill in with a bandwidth equal
the bandwidth of the diagonal pivot blocks, which in turn
are determined by LTRUNC.
In general, an increase in LVFILL results in a more accurate
factorization (and fewer iterations) but at the expense of
increased storage requirements and possibly more total time.
\mbox{[Default: $0$]}
\item[IPARM(17) \hfill](LTRUNC):
LTRUNC determines the truncation bandwidth to be used when
approximating the inverses of matrices with dense banded
matrices. It affects the preconditioners LJACX$i$, BIC$i$,
BICX$i$, MBIC$i$, MBICX$i$ for $i=2,3$. If the band matrix
whose inverse is being approximated has a half-bandwidth of
$s$, $s+\mbox{LTRUNC}$ will be the half-bandwidth
of the approximating inverse. Thus, LTRUNC is the increase
in bandwidth to be used for the inverse approximation over
the bandwidth of the original matrix. In general, an
increase in LTRUNC means a more accurate factorization at
the expense of increased storage. \mbox{[Default: $0$]}
\item[IPARM(18) \hfill](IPROPA):
IPROPA is a flag to indicate whether or not matrix $A$ has
Property A. If a matrix has Property A, it is two-cyclic
and can be permuted to a red-black matrix. Also, a
considerable savings in storage is possible if a factorization
preconditioner is called. IPROPA affects the following
preconditioners:
\begin{tabular}{ll}
IC$i$ & $(i=1,2,3,6)$ \\
MIC$i$ & $(i=1,2,3,6)$ \\
BIC$i$ & $(i=2,3,7)$ \\
BICX$i$ & $(i=2,3,7)$ \\
MBIC$i$ & $(i=2,3,7)$ \\
MBICX$i$ & $(i=2,3,7)$
\end{tabular}
For the first two preconditioners, IPROPA refers to point
Property A. For the last four preconditioners, IPROPA
refers to block Property A (i.e., whether the matrix
considered as a block matrix has Property A). IPROPA
can assume the following values on input:
\begin{tabular}{lp{3.5in}}
$= 0$ & if matrix $A$ does not have Property A \\
$= 1$ & if matrix $A$ has Property A \\
$= 2$ & if it is not known whether or not matrix $A$ has \\
& \hspace{0.1in} Property A; compute if needed (default)
\end{tabular}
If IPROPA $= 2$ and LVFILL $= 0$ on input, and one of the
six methods above is used, it is determined whether or
not the matrix has property A, and IPROPA is reset to
$0$ or $1$ accordingly.
Determining if matrix $A$ has Property A requires $2$N
workspace from IWKSP, so it is advantageous for the
user to inform NSPCG if it is known beforehand whether
or not the matrix has Property A. In general, finite
element matrices do not have point Property A. If a
$5$-point central finite difference stencil is used on
a two-dimensional self-adjoint PDE, or if a $7$-point
central finite difference stencil is used on a
three-dimensional self-adjoint PDE, the resulting matrix has
Property A. \mbox{[Default: $2$]}
\item[IPARM(19) \hfill](KBLSZ):
KBLSZ is the $1$-D block size. It is used in the
line preconditioners. KBLSZ is the largest integer such
that, if matrix $A$ is considered as a block matrix,
the diagonal blocks have dense bands. \mbox{[Default: $-1$]}
\item[IPARM(20) \hfill](NBL2D):
NBL2D is the $2$-D block size. It is used only for the
CGCR acceleration, which is applied to $3$-D problems on
a box domain. \mbox{[Default: $-1$]}
\item[IPARM(21) \hfill](IFCTV):
IFCTV is a switch for indicating whether a scalar or a
vectorized routine is to be used for the incomplete
factorization of a matrix stored in symmetric or
nonsymmetric diagonal storage mode. The vectorized
routine should perform better for matrix factorization
patterns which have Property A. \mbox{[Default: $1$]}
\begin{tabular}{ll}
$0$ & use scalar routine \\
$1$ & use vectorized routine (default) \\
\end{tabular}
\item[IPARM(22) \hfill](IQLR):
IQLR specifies the orientation of the basic preconditioner.
The value of IQLR can be in the range $0$ to $3$.
\mbox{[Default: $1$]}
\begin{tabular}{ll}
$0$ & no basic preconditioner \\
$1$ & left preconditioning (default) \\
$2$ & right preconditioning \\
$3$ & split preconditioning
\end{tabular}
\item[IPARM(23) \hfill](ISYMM):
ISYMM is a symmetry switch for the matrix. It is used
only for the primary format. If the matrix is symmetric,
a considerable savings in storage is possible if a
factorization preconditioner is called. ISYMM can assume
the following values on input:
\begin{tabular}{lp{3.5in}}
$0$ & matrix is symmetric \\
$1$ & matrix is nonsymmetric \\
$2$ & it is unknown if the matrix is symmetric; NSPCG
should determine if the matrix is symmetric or not
(default)
\end{tabular}
If ISYMM $= 2$ and NSTORE $= 1$ on input, ISYMM is set to
$0$ or $1$ on output. \mbox{[Default: $2$]}
\item[IPARM(24) \hfill](IELIM):
IELIM is a switch for effectively removing rows and columns
when the diagonal entry is extremely large compared to the
nonzero off-diagonal entries in that row. See the description
for TOL [= RPARM(15)] for additional details.
\mbox{[Default: $0$]}
\begin{tabular}{ll}
$0$ & test not done \\
$1$ & test done for removal of rows and columns
\end{tabular}
\item[IPARM(25) \hfill](NDEG):
NDEG specifies the degree of the polynomial preconditioner.
\mbox{[Default: $1$]}
\end{list}
\newpage
\bigskip
\noindent
{\bf Description of RPARM parameters}
\bigskip
\begin{list}{}{\labelwidth 0.9in
\leftmargin 1.00in \rightmargin 0.25in}
\item[RPARM(1) \hfill](ZETA):
ZETA is the stopping test value or approximate relative
accuracy desired in the final computed solution. Iteration
terminates when the stopping test is less than ZETA. If
the method does not converge in ITMAX iterations, ZETA is
reset to an estimate of the relative accuracy achieved.
\mbox{[Default: $10^{-6}$]}
\item[RPARM(2) \hfill](EMAX):
EMAX is an eigenvalue estimate of the preconditioned matrix
$Q^{-1}A$. In the SPD case, EMAX is an estimate of the largest
eigenvalue of $Q^{-1}A$. In the nonsymmetric case, EMAX is an
estimate of the $2$-norm of $Q^{-1}A$. EMAX contains on output
a final adapted value if MAXADP~$= 1$ and the acceleration allows
estimation of eigenvalues. \mbox{[Default: $2.0$]}
\item[RPARM(3) \hfill](EMIN):
EMIN is an eigenvalue estimate of the preconditioned matrix
$Q^{-1}A$. In the SPD case, EMIN is an estimate of the smallest
eigenvalue of $Q^{-1}A$. In the nonsymmetric case, EMIN is an
estimate of the $2$-norm of the inverse of $Q^{-1}A$.
EMIN contains on output a final adapted value if
MINADP~$= 1$ and the acceleration allows estimation of
eigenvalues. \mbox{[Default: $1.0$]}
\item[RPARM(4) \hfill](FF):
FF is an adaptive procedure damping factor for the estimation
of OMEGA for the SSOR methods. Its values lie in the interval
$(0,1]$ with $1.0$ causing the most frequent parameter
changes when IOMGAD $= 1$ is specified. \mbox{[Default: $0.75$]}
\item[RPARM(5) \hfill](FFF):
FFF is an adaptive procedure damping factor for changing
EMAX and EMIN in the Chebyshev accelerations (SI and SRSI).
Its values lie in the interval $(0,1]$ with $1.0$ causing the
most frequent parameter changes when MAXADP~$= 1$ and
MINADP~$= 1$ are specified. \mbox{[Default: $0.75$]}
\item[RPARM(6) \hfill](TIMIT):
TIMIT on output is the iteration time in seconds of the
NSPCG call. The iteration time includes the time to perform
all the iterations, including the time to perform the
stopping test. \mbox{[Default: $0.0$]}
\item[RPARM(7) \hfill](DIGIT1):
DIGIT1 is one measure of the approximate number of digits
of accuracy of the solution. DIGIT1 is computed as the
negative of the logarithm base $10$ of the final value
of the stopping test. \mbox{[Default: $0.0$]}
\item[RPARM(8) \hfill](DIGIT2):
DIGIT2 is the approximate number of digits of accuracy
using the estimated relative residual with the final
approximate solution. DIGIT2 is computed as the negative
of the logarithm base $10$ of the ratio of the $2$-norm
of the residual vector and the $2$-norm of the right-hand-side
vector. This estimate is unrelated to the condition
number of the original system and therefore it will not
be accurate if the system is ill-conditioned.
\mbox{[Default: $0.0$]}
Note: DIGIT1 is determined from the actual stopping test
computed on the final iteration, whereas DIGIT2 is based
on the computed residual vector using the final approximate
solution after the algorithm has terminated. If these
values differ greatly, then either the stopping test has
not worked successfully or the original system is
ill-conditioned.
\item[RPARM(9) \hfill](OMEGA):
OMEGA serves two purposes:
\begin{enumerate}
\item
It is the overrelaxation parameter $\om$ for the SOR
and SSOR methods. OMEGA contains on output a final
adapted value if IOMGAD $= 1$ is specified and the
acceleration allows estimation of $\om$ (SOR, SRCG, and
SRSI only). Otherwise, OMEGA is not changed and the
fixed value of OMEGA is used throughout the iterations.
\item
It can be used in the modified incomplete factorization
methods (both point and block) to specify a degree of
modification. In the unmodified incomplete
factorization method, a factor element is discarded
if it results in fill-in outside the prespecified
fill-in region (determined by LVFILL). In the
modified incomplete factorization method, that factor
element is added to the diagonal element of the row
in which it would have caused the fill-in. OMEGA
can be used to indicate that $\om *$(factor element)
should be added to the diagonal element instead, where
$\om$ lies in the interval $[0,1]$. Thus, $\om=1$
corresponds
to full modification and $\om=0$ corresponds to no
modification. This facility is useful, for example,
when the IC (ILU) factorization of a matrix exists,
but the MIC (MILU) factorization does not. Then a
value of $\om$ between $0$ and $1$ can be chosen
with one
of the preconditioners MIC, MBIC, or MBICX to get
a stronger factorization than IC, BIC, or BICX,
respectively.
\end{enumerate}
\mbox{[Default: $1.0$]}
\item[RPARM(10) \hfill](ALPHAB):
ALPHAB is an estimate of the minimum eigenvalue of
$-D^{-1}(C_L+C_U)$ where $A=D-C_L-C_U$ for the
SSOR methods. ALPHAB contains on output a final estimated
value if IOMGAD $= 1$ is specified and the acceleration allows
estimation of ALPHAB (SRCG and SRSI only). ALPHAB only
affects the SSOR and LSSOR preconditioners, and is used in
the adaptive procedure for $\om$. \mbox{[Default: $0.0$]}
\item[RPARM(11) \hfill](BETAB):
BETAB is an estimate of the maximum eigenvalue of
$D^{-1}C_LD^{-1}C_U$ where $A=D-C_L-C_U$ for the SSOR
methods. BETAB contains on output a final estimated value
if IOMGAD $= 1$ is specified and the acceleration allows
estimation of BETAB (SRCG and SRSI only). BETAB only
affects the SSOR and LSSOR preconditioners, and is used in
the $\om$ adaptive procedure. \mbox{[Default: $0.25$]}
\item[RPARM(12) \hfill](SPECR):
SPECR is an estimate of the spectral radius of the SOR
iteration matrix. SPECR contains on output a final
estimated value if IOMGAD $= 1$ is specified and the
acceleration allows estimation of SPECR (SOR only).
SPECR only affects the SOR and LSOR preconditioners, and
is used in the $\om$ adaptive procedure. \mbox{[Default: $0.0$]}
\item[RPARM(13) \hfill](TIMFAC):
TIMFAC on output is the factorization time in seconds
required in the NSPCG call. \mbox{[Default: $0.0$]}
\item[RPARM(14) \hfill](TIMTOT):
TIMTOT on output is the total time in seconds for the
NSPCG call. TIMTOT $=$ TIMFAC $+$ TIMIT $+$ other where
``other'' includes scaling and permuting, if requested.
\mbox{[Default: $0.0$]}
\item[RPARM(15) \hfill](TOL):
TOL is a tolerance factor used for eliminating certain
equations when IELIM $= 1$ is selected. In that case,
rows are eliminated for which the ratio of the sum of
the absolute values of the off-diagonal elements to the
absolute value of the diagonal element is small (less
than TOL). This is done by dividing the right-hand-side
entry for that equation by the diagonal entry, setting
the diagonal entry equal to one, and setting the
off-diagonal entries of that row to zero. The off-diagonal
entries of the corresponding column are also set to zero
after correcting the right-hand-side vector. This
procedure is useful for linear systems arising from
finite element discretizations of partial differential
equations in which Dirichlet boundary conditions are
handled by penalty methods (giving the diagonal values
of the corresponding equations extremely large values).
(The installer of this package should set the value of
SRELPR. See comments in the Installation Guide in
Section~\ref{install} for additional details.)
\mbox{[Default: $500*$SRELPR]}
\item[RPARM(16) \hfill](AINF):
AINF is the infinity norm of the matrix $A$ if the LSP
preconditioner is used and the infinity norm of
$\lp\pD\rp^{-1}A$ if the LLSP preconditioner is used.
These preconditioners are only effective if the matrix
is SPD or nearly so. If the user does not overwrite
the default value, zero, the program attempts to calculate
a value for this quantity. \mbox{[Default: $0.0$]}
\end{list}
\newpage
\begin{table}
The default values for the IPARM and RPARM variables are
given in the tables below.
\begin{center}
\begin{tabular}{|l|l|c|} \hline
Position & Name & Default \\ \hline
IPARM(1) & NTEST & $2$ \\ \hline
IPARM(2) & ITMAX & $100$ \\ \hline
IPARM(3) & LEVEL & $0$ \\ \hline
IPARM(4) & NOUT & $6$ \\ \hline
IPARM(5) & IDGTS & $0$ \\ \hline
IPARM(6) & MAXADP & $1$ \\ \hline
IPARM(7) & MINADP & $1$ \\ \hline
IPARM(8) & IOMGAD & $1$ \\ \hline
IPARM(9) & NS1 & $5$ \\ \hline
IPARM(10) & NS2 & $100000$ \\ \hline
IPARM(11) & NS3 & $0$ \\ \hline
IPARM(12) & NSTORE & $2$ \\ \hline
IPARM(13) & ISCALE & $0$ \\ \hline
IPARM(14) & IPERM & $0$ \\ \hline
IPARM(15) & IFACT & $1$ \\ \hline
IPARM(16) & LVFILL & $0$ \\ \hline
IPARM(17) & LTRUNC & $0$ \\ \hline
IPARM(18) & IPROPA & $2$ \\ \hline
IPARM(19) & KBLSZ & $-1$ \\ \hline
IPARM(20) & NBL2D & $-1$ \\ \hline
IPARM(21) & IFCTV & $1$ \\ \hline
IPARM(22) & IQLR & $1$ \\ \hline
IPARM(23) & ISYMM & $2$ \\ \hline
IPARM(24) & IELIM & $0$ \\ \hline
IPARM(25) & NDEG & $1$ \\ \hline
\end{tabular}
\caption{Default Values for IPARM Variables}
\end{center}
\end{table}
\begin{table}
\begin{center}
\begin{tabular}{|l|l|c|} \hline
Position & Name & Default \\ \hline
RPARM(1) & ZETA & $10^{-6}$ \\ \hline
RPARM(2) & EMAX & $2.0$ \\ \hline
RPARM(3) & EMIN & $1.0$ \\ \hline
RPARM(4) & FF & $0.75$ \\ \hline
RPARM(5) & FFF & $0.75$ \\ \hline
RPARM(6) & TIMIT & $0.0$ \\ \hline
RPARM(7) & DIGIT1 & $0.0$ \\ \hline
RPARM(8) & DIGIT2 & $0.0$ \\ \hline
RPARM(9) & OMEGA & $1.0$ \\ \hline
RPARM(10) & ALPHAB & $0.0$ \\ \hline
RPARM(11) & BETAB & $0.25$ \\ \hline
RPARM(12) & SPECR & $0.0$ \\ \hline
RPARM(13) & TIMFAC & $0.0$ \\ \hline
RPARM(14) & TIMTOT & $0.0$ \\ \hline
RPARM(15) & TOL & $500*\mbox{SRELPR}$ \\ \hline
RPARM(16) & AINF & $0.0$ \\ \hline
\end{tabular}
\caption{Default Values for RPARM Variables}
\end{center}
\end{table}
\clearpage
\section{Storage Modes}
\label{storage}
\indent
Many factors must be taken into account when choosing the operator
representation for $A$ in an iterative solution method. An iterative
solver is typically one of many components in an application, and
the application may suggest an appropriate iterative solution method
and representation of the operator. For example, in many finite
element applications, it is convenient to represent the global stiffness
matrix $A$ in an element-based data structure rather than
assembling it explicitly. A suitable iterative method in that case
might be an acceleration without preconditioning since it is then
only necessary to compute matrix-vector products which can be computed
using the element stiffness matrices in an element-by-element approach.
Also, resource limitations of the computer may affect the method of
representing the operator $A$. Storage demands may require that
the matrix be regenerated for every iteration or that the effect
of an application of the operator $A$ on a vector be represented
implicitly. For details on using NSPCG in matrix format-free
mode, see Section~\ref{mffm}.
The properties of the operator may also have a strong bearing
on the choice of operator representation. The most efficient
iterative solution codes exploit knowledge of any special
characteristics of the matrix such as nonzero pattern, symmetry
in structure or coefficients, and the presence or absence of constant
coefficients. For purposes of computational and storage
efficiency, a sparse matrix data structure should normally
be chosen which matches the sparsity pattern of the operator
$A$. In particular, the vectorization and parallelization of
preconditioners depends strongly upon the exploitation of
the sparsity pattern. If, on the other hand, a sparse
matrix data structure is not compatible with the matrix sparsity
pattern, the potential of storing and doing computations
with too many zeros arises. Symmetry in matrix structure or
coefficients may allow economies in storage costs. Finally,
if the operator has constant coefficients, it would be wasteful
in a production code to use a data structure which stores all
the nonzero coefficients of the matrix.
NSPCG currently allows several storage modes for representing
the coefficient matrix. A short description of each follows.
In considering these storage schemes, the user should select
the scheme most convenient for the application. If the matrix has
a diagonal or block structure, then storage by diagonals can be a
very efficient way to represent the matrix. Also, block
preconditioners can be selected with this storage mode, many of which
have excellent convergence properties. In addition, many
computations vectorize with this storage mode, making the package
more efficient on pipelined computers. On the other hand, if the
matrix is unstructured, then storage by diagonals will result in a
great many zeros being stored and computed upon. In this case,
the primary storage format may be more desirable. The primary
format will be efficient if there is a relatively constant number
of nonzeros per equation. If one equation has many more nonzeros
than the rest, then MAXNZ will have to be large enough to
accommodate it, resulting in many zeros being stored for the other
equations. If this is the case, coordinate storage format can be
used.
\begin{description}
\newpage
\item[Primary Format]:
In this storage format, there are two rectangular arrays COEF and
JCOEF,
one real and one integer, which are used to store a representation
of the coefficient matrix. Both arrays are dimensioned NDIM by
MDIM, where NDIM is at least as large as N, the size of the system,
and MDIM is at least as large as MAXNZ, the maximum number of
nonzeros per row in the matrix, with the maximum being taken over
all rows. Each row in COEF will contain the nonzeros of the
corresponding row in the full matrix $A$, and the corresponding
row in JCOEF will contain its respective column numbers.
For example, the matrix
\[
A = \lp \begin{array}{rrrrr}
11 & 0 & 0 & 14 & 15 \\
0 & 22 & 0 & 0 & 0 \\
0 & 0 & 33 & 0 & 0 \\
14 & 0 & 0 & 44 & 45 \\
15 & 0 & 0 & 45 & 55
\end{array} \rp
\]
would be represented in the COEF and JCOEF arrays as
\[
\mbox{COEF} = \lp \begin{array}{rrr}
11 & 14 & 15 \\
22 & 0 & 0 \\
33 & 0 & 0 \\
44 & 14 & 45 \\
55 & 15 & 45
\end{array} \rp
\]
\[
\mbox{JCOEF} = \lp \begin{array}{rrr}
1 & 4 & 5 \\
2 & 0 & 0 \\
3 & 0 & 0 \\
4 & 1 & 5 \\
5 & 1 & 4
\end{array} \rp
\]
There are several remarks which should be made ---
\begin{enumerate}
\item
If a row in matrix $A$ has fewer than MAXNZ nonzeros,
the corresponding rows in COEF and JCOEF should be
padded with zeros.
\item
The nonzero entries in a given row of COEF may appear
in any order. However, if the diagonal element is not
in column one, then NSPCG will place it there without
returning it to its original position on exiting.
\item
Several splittings will attempt to rearrange COEF and
JCOEF in order to make the preconditioning solution
process more efficient and vectorizable. They may
even attempt to expand the matrix within the limit
set by MDIM, increasing MAXNZ in the process. However,
the matrix returned in COEF and JCOEF at output will be
equivalent to the one at input to within roundoff
error. Slight roundoff error in the coefficient
arrays and RHS will result if the user requests that
scaling of the matrix be done.
\item
For this data format, all nonzero matrix entries must be
present even if the matrix is symmetric. It does not
suffice to store just the upper or lower triangle of
the matrix $A$.
\end{enumerate}
\newpage
\item[Symmetric Diagonal Format]:
This storage format uses a real rectangular array and an
integer vector to store a representation of the matrix. COEF
is dimensioned NDIM by MDIM and JCOEF is dimensioned to length
MDIM. NDIM is at least as large as N, the size of the linear
system, and similarly MDIM is at least as large as MAXNZ, the
number of diagonals to be stored. Each column of COEF contains
a diagonal of $A$ and JCOEF contains a nonnegative integer for
each diagonal indicating its distance from the main diagonal.
For example, the main diagonal has a distance of $0$, the first
superdiagonal has a distance of $1$, etc. This storage format may
be used only if matrix $A$ is symmetric. Only the main diagonal
and nonzero diagonals appearing in the upper triangular part of
$A$ are stored.
For example, the matrix
\[
A = \lp \begin{array}{rrrrr}
11 & 12 & 0 & 14 & 0 \\
12 & 22 & 23 & 0 & 25 \\
0 & 23 & 33 & 34 & 0 \\
14 & 0 & 34 & 44 & 45 \\
0 & 25 & 0 & 45 & 55
\end{array} \rp
\]
would be represented in the COEF and JCOEF arrays as
\[
\mbox{COEF} = \lp \begin{array}{rrr}
11 & 12 & 14 \\
22 & 23 & 25 \\
33 & 34 & 0 \\
44 & 45 & 0 \\
55 & 0 & 0
\end{array} \rp
\]
\[
\mbox{JCOEF} = \lp \begin{array}{r}
0 \\
1 \\
3
\end{array} \rp
\]
There are several remarks which should be made ---
\begin{enumerate}
\item
Element $a_{ij}$ of the matrix always appears in row $i$
of the COEF array. Short diagonals must be padded with
zeros at the bottom. In other words, the diagonals are
top-justified.
\item
The diagonals of $A$ may be stored in any order. However,
if the main diagonal of $A$ does not appear in column one
of COEF, then NSPCG will place it there without returning
it to its original position on exiting.
\item
As with the previous format, many preconditioners will
attempt to rearrange COEF and JCOEF and possibly expand
COEF within the limit set by MDIM, changing MAXNZ in the
process.
\end{enumerate}
\newpage
\item[Nonsymmetric Diagonal Format]:
This storage format is similar to the one described above
except that all nonzero diagonals must be stored. If a diagonal
of $A$ is below the main diagonal, its corresponding entry in JCOEF
is negative. Thus, the first sub-diagonal has a JCOEF entry of
$-1$, the second sub-diagonal has a JCOEF entry of $-2$, etc. This
storage format is intended for nonsymmetric diagonally structured
matrices.
For example, the matrix
\[
A = \lp \begin{array}{rrrrr}
11 & 10 & 0 & 14 & 0 \\
12 & 22 & 21 & 0 & 25 \\
0 & 23 & 33 & 32 & 0 \\
30 & 0 & 34 & 44 & 43 \\
0 & 25 & 0 & 45 & 55
\end{array} \rp
\]
would be represented in the COEF and JCOEF arrays as
\[
\mbox{COEF} = \lp \begin{array}{rrrrr}
11 & 14 & 10 & 0 & 0 \\
22 & 25 & 21 & 12 & 0 \\
33 & 0 & 32 & 23 & 0 \\
44 & 0 & 43 & 34 & 30 \\
55 & 0 & 0 & 45 & 25
\end{array} \rp
\]
\[
\mbox{JCOEF} = \lp \begin{array}{r}
0 \\
3 \\
1 \\
-1 \\
-3
\end{array} \rp
\]
There are several remarks which should be made ---
\begin{enumerate}
\item
Element $a_{ij}$ of the matrix always appears in row $i$ of
the COEF array. Short diagonals should be padded at the
bottom with zeros if they are superdiagonals and should
be padded at the top with zeros if they are subdiagonals.
Thus, superdiagonals are top-justified and subdiagonals
are bottom-justified.
\item
The diagonals of $A$ may be stored in COEF in any order.
However, if the main diagonal of $A$ does not appear in
column one of COEF, then NSPCG will place it there
without returning it to its original position on exiting.
\item
As with the previous format, many preconditioners may
attempt to rearrange COEF and JCOEF and possibly expand
COEF within the limit set by MDIM, changing MAXNZ in the
process.
\end{enumerate}
\newpage
\item[Symmetric Coordinate Format]:
This storage format uses a real vector and an integer array
to store a representation of the matrix. COEF is dimensioned
to length NDIM and JCOEF is dimensioned NDIM by $2$.
COEF contains the nonzero elements of the matrix and JCOEF
contains the corresponding $i$ values in column $1$ and the
corresponding $j$ values in column $2$. Thus if
COEF(k) $= a_{i,j}$, then JCOEF(k,1) $=i$ and JCOEF(k,2) $=j$.
NDIM is at least as large as MAXNZ, the number of nonzeros
stored, and MDIM can be set to anything.
This storage format may be used only if matrix $A$ is symmetric.
Only the main diagonal and nonzero coefficients appearing in the
upper triangular part of $A$ are stored.
For example, the matrix
\[
A = \lp \begin{array}{rrrrr}
11 & 12 & 0 & 14 & 0 \\
12 & 22 & 23 & 0 & 25 \\
0 & 23 & 33 & 34 & 0 \\
14 & 0 & 34 & 44 & 45 \\
0 & 25 & 0 & 45 & 55
\end{array} \rp
\]
would be represented in the COEF and JCOEF arrays as
\[
\mbox{COEF} = (11,22,33,44,55,12,23,34,45,14,25)
\]
\[
\mbox{JCOEF} = \lp \begin{array}{cc}
1 & 1 \\
2 & 2 \\
3 & 3 \\
4 & 4 \\
5 & 5 \\
1 & 2 \\
2 & 3 \\
3 & 4 \\
4 & 5 \\
1 & 4 \\
2 & 5
\end{array} \rp
\]
For this example, N $=5$, MAXNZ $=11$, NDIM $=11$, and MDIM can
be set to anything.
There are several remarks which should be made ---
\begin{enumerate}
\item
All elements of the main diagonal must be stored, even if
some are zeros.
\item
Duplicate entries (entries having the same $i$ and $j$
coordinates) are allowed. NSPCG removes duplicates by
adding their corresponding coefficient values. Thus,
MAXNZ may be reduced upon output.
\item
The nonzeros of $A$ may be stored in any order. However,
NSPCG will rearrange the elements of the COEF and JCOEF
arrays into a convenient order without returning
it to its original position on exiting. In particular,
the diagonal elements of the matrix will be placed into
the first N locations of COEF.
\end{enumerate}
\newpage
\item[Nonsymmetric Coordinate Format]:
This storage format is similar to the one described above
except that all nonzero coefficients must be stored.
This storage format is intended for nonsymmetric matrices.
For example, the matrix
\[
A = \lp \begin{array}{rrrrr}
11 & 10 & 0 & 14 & 0 \\
12 & 22 & 21 & 0 & 25 \\
0 & 23 & 33 & 32 & 0 \\
30 & 0 & 34 & 44 & 43 \\
0 & 25 & 0 & 45 & 55
\end{array} \rp
\]
would be represented in the COEF and JCOEF arrays as
\[
\mbox{COEF} = (11,22,33,44,55,14,25,10,21,32,43,12,23,34,45,30,25)
\]
\[
\mbox{JCOEF} = \lp \begin{array}{cc}
1 & 1 \\
2 & 2 \\
3 & 3 \\
4 & 4 \\
5 & 5 \\
1 & 4 \\
2 & 5 \\
1 & 2 \\
2 & 3 \\
3 & 4 \\
4 & 5 \\
2 & 1 \\
3 & 2 \\
4 & 3 \\
5 & 4 \\
4 & 1 \\
5 & 2
\end{array} \rp
\]
For this example, N $=5$, MAXNZ $=17$, NDIM $=17$, and MDIM can
be set to anything.
The remarks regarding symmetric coordinate storage format apply
for the nonsymmetric format also.
\end{description}
\newpage
\section{Workspace Requirements}
\label{nw}
\indent
In this section, estimated maximum requirements in NSPCG for real
and integer workspace are given. These values should be used initially
for the NW and INW parameters in the NSPCG calling sequence, and
parameters WKSP and IWKSP should be dimensioned to at least NW and
INW, respectively. If insufficient real or integer workspace is
provided, a fatal error occurs with IER set to $-2$ or $-3$ and NW or
INW set to the amount of workspace needed at the point execution
terminated. Thus, an easy way to determine storage requirements is
to run the package repeatedly using the values of NW and INW suggested
by NSPCG from the previous run until enough workspace is supplied.
On the other hand, if sufficient real and integer workspace
is provided, NW and INW are set on output to the amount of real and
integer workspace actually required. Thus, workspace requirements can
be reduced to these levels when rerunning the same problem.
Workspace requirements can often be reduced if the user chooses
certain IPARM quantities carefully. For the symmetric accelerators,
the choice of stopping test, as determined by NTEST [= IPARM(1)],
has no effect on workspace requirements. For the nonsymmetric
accelerators, some stopping tests can increase storage demands by
up to 3N. Thus a stopping test should be chosen which is efficient
in time and memory. See Section~\ref{stop} for information regarding
the selection of stopping tests in the nonsymmetric case.
Also,
2N integer workspace can be saved if the user informs NSPCG whether
or not matrix $A$ has Property A if this property affects a
preconditioner. This is done by setting IPROPA [= IPARM(18)]
appropriately.
In the following formulas, let
\bigskip
\begin{tabular}{ll}
N & be the problem size \\
MAXNZ & be the active column width of the COEF array \\
ITMAX & be IPARM(2) \\
NS1 & be IPARM(9) \\
NS2 & be IPARM(10) \\
LVFILL & be IPARM(16) \\
LTRUNC & be IPARM(17) \\
IPROPA & be IPARM(18) \\
KBLSZ & be IPARM(19) \\
NBL2D & be IPARM(20) \\
ISYMM & be IPARM(23) \\
NV & be $\max (1,\min ({\rm NS1}, {\rm NS2}-1))$ \\
NH & be $2 + \min ({\rm ITMAX}, {\rm NS2})$ \\
NBLK & be N/NBL2D \\
NFACTI & be the number of diagonals in the factorization \\
NCMAX & be the maximum number of nodes of the same color in a \\
& multicolor preconditioner \\
KEYGS & be the gather/scatter switch set in routine DFAULT \\
& (see Section~\ref{install} for details on this switch) \\
DB & be the full bandwidth of $\pD$ where $A=\pD+\pL+\pU$ \\
& in block notation \\
DHB & be the half bandwidth of $\pD$ \\
ABAND & be the full bandwidth of matrix $A$, in 2-D blocks \\
& (e.g., ABAND = 3 for a 3-D 7-point operator on a box)
\end{tabular}
\bigskip
\noindent
Then
\begin{eqnarray*}
{\rm NW} & = & {\rm NWA}_{\la {\em accel} \ra}
+ {\rm NWS}_{\la {\em accel} \ra}
+ {\rm NWP}_{\la {\em precon} \ra}
+ {\rm NWF}_{\la {\em precon} \ra} \\
{\rm INW} & = & {\rm INWP}_{\la {\em precon} \ra}
+ {\rm INWF}_{\la {\em precon} \ra}
\end{eqnarray*}
As noted above, ${\rm NWS}_{\la {\em accel} \ra}=0$ if
$\la$~{\em accel}~$\ra$
is CG, SI, SOR, SRCG, or SRSI, and can be up to a maximum of
$3{\rm N}$ otherwise. A table giving the real workspace required for
each accelerator, ${\rm NWA}_{\la {\em accel} \ra}$, is given below, followed
by a table giving the real and integer workspace required for each
preconditioner, ${\rm NWP}_{\la {\em precon} \ra}$ and
${\rm INWP}_{\la {\em precon} \ra}$. Finally, a table is shown giving
real and integer workspace requirements for factorizations for certain
preconditioners, ${\rm NWF}_{\la {\em precon} \ra}$ and
${\rm INWF}_{\la {\em precon} \ra}$.
\clearpage
\begin{table}
\begin{center}
\begin{tabular}{|l|l|}
\hline
$\la$~{\em accel}~$\ra$ & ${\rm NWA}_{\la {\em accel} \ra}$ \\
\hline
CG & $3{\rm N}+2*{\rm ITMAX}$ \\ \hline
SI & $4{\rm N}$ \\ \hline
SRCG & $3{\rm N}+2*{\rm ITMAX}$ \\ \hline
SRSI & $4{\rm N}$ \\ \hline
SOR & $2{\rm N}$ \\ \hline
BASIC & $2{\rm N}$ \\ \hline
ME & $9{\rm N}$ \\ \hline
CGNR & $4{\rm N}+2*{\rm ITMAX}$ \\ \hline
LSQR & $5{\rm N}$ \\ \hline
ODIR & $(2*{\rm NV}+5)*{\rm N}+{\rm NV}$ \\ \hline
OMIN & $3*{\rm NV}+2+(2*{\rm NV}+6)*{\rm N}+{\rm NH}*
({\rm NV}+2)+{\rm NH}^2 + 2*{\rm NH}$ \\ \hline
ORES & $(2*{\rm NV}+4)*{\rm N}+{\rm NV}+1$ \\ \hline
IOM & $(2*{\rm NS1}+2)*{\rm N}+5*{\rm NS1}+1$ \\ \hline
GMRES & ${\rm NH}*({\rm NV}+3)+{\rm N}*({\rm NV}+3)+2*({\rm NV}+2)^2
+7*{\rm NV}+17+{\rm NH}^2+2*{\rm NH}$ \\
& Add ${\rm N}*({\rm NV}+1)$ if IQLR $=2$ or $3$ \\
& Add ${\rm N}*({\rm NV}+3)+{\rm NV}+1$ if ${\rm NS1}<{\rm NS2}-1$
\\ \hline
USYMLQ & $8{\rm N}+11$ \\ \hline
USYMQR & $8{\rm N}+14$ \\ \hline
LDIR & $8{\rm N}$ \\ \hline
LMIN & $8{\rm N}$ \\ \hline
LRES & $7{\rm N}$ \\ \hline
BCGS & $9{\rm N}$ \\ \hline
CGCR & OMIN workspace plus ${\rm N}+{\rm NBLK}+{\rm NBLK}*{\rm ABAND}$
\\ \hline
\end{tabular}
\caption{Values for ${\rm NWA}_{\la {\em accel} \ra}$ }
\end{center}
\end{table}
\begin{table}
\begin{center}
\begin{tabular}{|l|l|l|} \hline
$\la$~{\em precon}~$\ra$ & ${\rm NWP}_{\la {\em precon} \ra}$
& ${\rm INWP}_{\la {\em precon} \ra}$ \\ \hline
RICH1 & 0, N if KEYGS = 1 & 0 \\ \hline
JAC1 & 0, N if KEYGS = 1 & 0 \\ \hline
SOR1 & N & 0 \\ \hline
SSOR1 & N, $2{\rm N}$ if ISYMM = 1 & 0 \\ \hline
IC1 & N & 0 \\ \hline
MIC1 & N & 0 \\ \hline
LSP1 & $2{\rm N}$, $3{\rm N}$ if KEYGS = 1 & 0 \\ \hline
NEU1 & N, $2{\rm N}$ if KEYGS = 1 & 0 \\ \hline
RICH2 & 0 & 0 \\ \hline
JAC2 & 0 & 0 \\ \hline
LJAC2 & 0 & 0 \\ \hline
LJACX2 & 0 & 0 \\ \hline
SOR2 & 0 & MAXNZ \\ \hline
SSOR2 & 0 & MAXNZ \\ \hline
IC2 & 0 & NFACTI \\ \hline
MIC2 & 0 & NFACTI \\ \hline
LSP2 & $2{\rm N}$ & 0 \\ \hline
NEU2 & N & 0 \\ \hline
\end{tabular}
\caption{Values for ${\rm NWP}_{\la {\em precon} \ra}$ and
${\rm INWP}_{\la {\em precon} \ra}$}
\end{center}
\end{table}
\begin{table}
\begin{center}
\begin{tabular}{|l|l|l|} \hline
$\la$~{\em precon}~$\ra$ & ${\rm NWP}_{\la {\em precon} \ra}$
& ${\rm INWP}_{\la {\em precon} \ra}$ \\ \hline
LSOR2 & 0 & 0 \\ \hline
LSSOR2 & N & 0 \\ \hline
BIC2 & KBLSZ & 0 \\ \hline
MBIC2 & KBLSZ & 0 \\ \hline
BICX2 & KBLSZ & 0 \\ \hline
MBICX2 & KBLSZ & 0 \\ \hline
LLSP2 & $2{\rm N}$ & 0 \\ \hline
LNEU2 & $2{\rm N}$ & 0 \\ \hline
RICH3 & 0 & 0 \\ \hline
JAC3 & 0 & 0 \\ \hline
LJAC3 & 0 & 0 \\ \hline
LJACX3 & 0 & 0 \\ \hline
SOR3 & 0 & MAXNZ \\ \hline
SSOR3 & N & MAXNZ \\ \hline
IC3 & 0 & NFACTI \\ \hline
MIC3 & 0 & NFACTI \\ \hline
LSP3 & $2{\rm N}$ & 0 \\ \hline
NEU3 & N & 0 \\ \hline
LSOR3 & 0 & 0 \\ \hline
LSSOR3 & N & 0 \\ \hline
BIC3 & $2*{\rm KBLSZ}$ & 0 \\ \hline
MBIC3 & $2*{\rm KBLSZ}$ & 0 \\ \hline
BICX3 & $2*{\rm KBLSZ}$ & 0 \\ \hline
MBICX3 & $2*{\rm KBLSZ}$ & 0 \\ \hline
LLSP3 & $2{\rm N}$ & 0 \\ \hline
LNEU3 & $2{\rm N}$ & 0 \\ \hline
RICH4 & 0, $2{\rm N}$ if KEYGS = 1 & 0 \\ \hline
JAC4 & 0, $2{\rm N}$ if KEYGS = 1 & 0 \\ \hline
LSP4 & $2{\rm N}$, $4{\rm N}$ if KEYGS = 1 & 0 \\ \hline
NEU4 & N, $3{\rm N}$ if KEYGS = 1 & 0 \\ \hline
RICH5 & 0, $2{\rm N}$ if KEYGS = 1 & 0 \\ \hline
JAC5 & 0, $2{\rm N}$ if KEYGS = 1 & 0 \\ \hline
LSP5 & $2{\rm N}$, $4{\rm N}$ if KEYGS = 1 & 0 \\ \hline
NEU5 & N, $3{\rm N}$ if KEYGS = 1 & 0 \\ \hline
SOR6 & N & 0 \\ \hline
SSOR6 & ${\rm N}+{\rm NCMAX}$ & 0 \\ \hline
IC6 & N & 0 \\ \hline
MIC6 & N & 0 \\ \hline
RS6 & $2{\rm N}$ & 0 \\ \hline
SOR7 & 0 & 0 \\ \hline
SSOR7 & N & 0 \\ \hline
BIC7 & $2*{\rm NCMAX}$ & 0 \\ \hline
MBIC7 & $2*{\rm NCMAX}$ & 0 \\ \hline
BICX7 & $2*{\rm NCMAX}$ & 0 \\ \hline
MBICX7 & $2*{\rm NCMAX}$ & 0 \\ \hline
RS7 & N & 0 \\ \hline
\end{tabular}
\caption{Values for ${\rm NWP}_{\la {\em precon} \ra}$ and
${\rm INWP}_{\la {\em precon} \ra}$, cont.}
\end{center}
\end{table}
\newpage
\begin{table}
\begin{center}
\begin{tabular}{|l|l|l|l|} \hline
$\la$~{\em precon}~$\ra$ & Case & ${\rm NWF}_{\la {\em precon} \ra}$
& ${\rm INWF}_{\la {\em precon} \ra}$ \\ \hline
IC1, & IPROPA = 1, LVFILL = 0 & N & 0 \\
MIC1 & IPROPA = 0, LVFILL = 0, ISYMM = 0
& $\ha{\rm N}*{\rm MAXNZ}$ & 0 \\
& IPROPA = 0, LVFILL = 0, ISYMM = 1
& ${\rm N}*{\rm MAXNZ}$ & 0 \\
& LVFILL $>$ 0, ISYMM = 0
& $>\ha{\rm N}*{\rm MAXNZ}$ & $>\ha{\rm N}*{\rm MAXNZ}$ \\
& LVFILL $>$ 0, ISYMM = 1
& $>{\rm N}*{\rm MAXNZ}$ & $>{\rm N}*{\rm MAXNZ}$ \\
& IPROPA = 2 & 0 & $\max (2{\rm N},{\rm above})$ \\ \hline
LJAC2 & all cases & ${\rm N}*{\rm DHB}$ & 0 \\ \hline
LJACX2 & all cases & ${\rm N}*({\rm DHB}+{\rm LTRUNC})$ & 0 \\ \hline
IC2, & IPROPA = 1, LVFILL = 0 & N & ${\rm MAXNZ}+{\rm MAXNZ}^2$ \\
MIC2 & IPROPA = 0, LVFILL = 0
& ${\rm N}*{\rm MAXNZ}$ & ${\rm MAXNZ}+{\rm MAXNZ}^2$ \\
& LVFILL $>$ 0
& $>{\rm N}*{\rm MAXNZ}$ & $>{\rm MAXNZ}^2$ \\
& IPROPA = 2 & 0 & $\max (2{\rm N},{\rm above})$ \\ \hline
LSOR2 & all cases & ${\rm N}*{\rm DHB}$ & $3({\rm MAXNZ}+1)$ \\ \hline
LSSOR2 & all cases & ${\rm N}*{\rm DHB}$ & $3({\rm MAXNZ}+1)$ \\ \hline
BIC2, & IPROPA = 1, LVFILL = 0 & ${\rm N}*({\rm DHB}+{\rm LTRUNC})$
& $3({\rm MAXNZ}+1)$ \\
MBIC2, & IPROPA = 0, LVFILL = 0, LTRUNC = 0
& ${\rm N}*{\rm MAXNZ}$ & $3({\rm MAXNZ}+1)$ \\
BICX2, & otherwise
& $>{\rm N}*{\rm MAXNZ}$ & $3({\rm MAXNZ}+1)$ \\
MBICX2 & IPROPA = 2 & 0 & $\max (2\frac{{\rm N}}{{\rm KBLSZ}},{\rm above})$
\\ \hline
LLSP2 & all cases & ${\rm N}*{\rm DHB}$ & 0 \\ \hline
LNEU2 & all cases & ${\rm N}*{\rm DHB}$ & 0 \\ \hline
LJAC3 & all cases & ${\rm N}*{\rm DB}$ & 0 \\ \hline
LJACX3 & all cases & ${\rm N}*({\rm DB}+2{\rm LTRUNC})$ & 0 \\ \hline
IC3, & IPROPA = 1, LVFILL = 0 & N & ${\rm MAXNZ}+{\rm MAXNZ}^2$ \\
MIC3 & IPROPA = 0, LVFILL = 0
& ${\rm N}*{\rm MAXNZ}$ & ${\rm MAXNZ}+{\rm MAXNZ}^2$ \\
& LVFILL $>$ 0
& $>{\rm N}*{\rm MAXNZ}$ & $>{\rm MAXNZ}^2$ \\
& IPROPA = 2 & 0 & $\max (2{\rm N},{\rm above})$ \\ \hline
LSOR3 & all cases & ${\rm N}*{\rm DB}$ & $3({\rm MAXNZ}+1)$ \\ \hline
LSSOR3 & all cases & ${\rm N}*{\rm DB}$ & $3({\rm MAXNZ}+1)$ \\ \hline
BIC3, & IPROPA = 1, LVFILL = 0 & ${\rm N}*({\rm DB}+2{\rm LTRUNC})$
& $3({\rm MAXNZ}+1)$ \\
MBIC3, & IPROPA = 0, LVFILL = 0, LTRUNC = 0
& ${\rm N}*{\rm MAXNZ}$ & $3({\rm MAXNZ}+1)$ \\
BICX3, & otherwise
& $>{\rm N}*{\rm MAXNZ}$ & $3({\rm MAXNZ}+1)$ \\
MBICX3 & IPROPA = 2 & 0 & $\max (2\frac{{\rm N}}{{\rm KBLSZ}},{\rm above})$
\\ \hline
LLSP3 & all cases & ${\rm N}*{\rm DB}$ & 0 \\ \hline
LNEU3 & all cases & ${\rm N}*{\rm DB}$ & 0 \\ \hline
IC6, & IPROPA = 1, LVFILL = 0 & N & 0 \\
MIC6 & IPROPA = 0, LVFILL = 0
& ${\rm N}*{\rm MAXNZ}$ & 0 \\
& LVFILL $>$ 0 not allowed & & \\
& IPROPA = 2 & 0 & $\max (2{\rm N},{\rm above})$ \\ \hline
SOR7 & all cases & ${\rm N}*{\rm DB}$ & \\ \hline
SSOR7 & all cases & ${\rm N}*{\rm DB}$ & \\ \hline
BIC7, & IPROPA = 1 & ${\rm N}*({\rm DB}+2{\rm LTRUNC})$ & \\
MBIC7, & IPROPA = 0, LTRUNC = 0 & ${\rm N}*{\rm MAXNZ}$ & \\
BICX7, & else & $> {\rm N}*{\rm MAXNZ}$ & \\
MBICX7 & & & \\ \hline
RS7 & all cases & ${\rm N}*{\rm DB}$ & \\ \hline
others & all cases & 0 & 0 \\ \hline
\end{tabular}
\caption{Values for ${\rm NWF}_{\la {\em precon} \ra}$ and
${\rm INWF}_{\la {\em precon} \ra}$}
\end{center}
\end{table}
\clearpage
\newpage
\section{Stopping Tests}
\label{stop}
\indent
The following stopping tests are permitted for the iterative
methods. In each case, the iteration process stops whenever the
given quantity falls below ZETA [= RPARM(1)]. The number given in
parentheses is the corresponding value of NTEST [= IPARM(1)].
A large number of stopping tests is provided for experimentation
by the user. See \cite{NSPCG} for comments regarding the motivation
in using these tests.
\begin{eqnarray}
\frac{\mbox{EMAX}}{\mbox{EMIN}} \lb \frac
{\la r^{(n)},\tilde z^{(n)}\ra }
{\la b,Q^{-1}b \ra} \rb ^\ha
& < & \zeta \\
\frac{1}{\mbox{EMIN}} \lb \frac
{\la \tilde z^{(n)},\tilde z^{(n)}\ra }
{\la u^{(n)},u^{(n)}\ra } \rb ^\ha
& < & \zeta \\
\frac{\mbox{EMAX}}{\mbox{EMIN}} \lb \frac
{\la \tilde z^{(n)},\tilde z^{(n)}\ra}
{\la Q^{-1}b,Q^{-1}b\ra } \rb ^\ha
& < & \zeta \\
\lb \frac{\la \tilde z^{(n)},\tilde z^{(n)}\ra }
{\la Q^{-1}b,Q^{-1}b\ra } \rb ^\ha
& < & \zeta \\
\lb \frac{\la r^{(n)},r^{(n)}\ra}
{\la b,b \ra } \rb ^\ha
& < & \zeta \\
\lb \frac{\la u^{(n)}-\bar{u},u^{(n)}-\bar{u} \ra }
{\la \bar{u},\bar{u} \ra } \rb^\ha
& < & \zeta \\
\frac{\mbox{EMAX}}{\mbox{EMIN}} \lb \frac
{\la r^{(n)},z^{(n)}\ra }
{\la b,Q_L^{-1}b \ra} \rb ^\ha
& < & \zeta \\
\frac{1}{\mbox{EMIN}} \lb \frac
{\la z^{(n)},z^{(n)}\ra }
{\la u^{(n)},u^{(n)}\ra } \rb ^\ha
& < & \zeta \\
\frac{\mbox{EMAX}}{\mbox{EMIN}} \lb \frac
{\la z^{(n)},z^{(n)}\ra}
{\la Q_L^{-1}b,Q_L^{-1}b\ra } \rb ^\ha
& < & \zeta \\
\lb \frac{\la z^{(n)},z^{(n)}\ra }
{\la Q_L^{-1}b,Q_L^{-1}b\ra } \rb ^\ha
& < & \zeta
\end{eqnarray}
\bigskip
Here, EMAX [= RPARM(2)] and EMIN [= RPARM(3)] are estimates of
the $2$-norm of the preconditioned matrix and its inverse. In the
symmetric case, EMAX and EMIN are estimates of the maximum and
minimum eigenvalues of $Q^{-1}A$, respectively. Also,
\bigskip
\begin{tabular}{ll}
& \\
$Q$ & is the preconditioning matrix, \\
& \\
$u^{(n)}$ & is the current iterate, \\
& \\
$r^{(n)}=b-Au^{(n)}$ & is the current residual, \\
& \\
$z^{(n)}=Q_L^{-1}r^{(n)}$ & is the current
left-preconditioned residual, \\
& \\
$\tilde z^{(n)}=Q^{-1}r^{(n)}=Q_R^{-1}Q_L^{-1}r^{(n)}$ & is the current
pseudo-residual, and \\
& \\
$\bar{u}=A^{-1}b$ & is the true solution. \\
&
\end{tabular}
\bigskip
Various accelerators calculate certain vectors or inner products
automatically, allowing the stopping test to be performed very
cheaply. A judicious choice of stopping test is necessary in order
to permit the accelerator to run most efficiently.
Stopping test (6) requires that UBAR contain the solution to
$Au=b$. This stopping test may be used for testing purposes whenever
the exact solution UBAR is known.
\bigskip
\noindent
{\bf Tests for the Symmetric Accelerators}
\bigskip
\indent
The symmetric accelerators CG, SI, SRCG, and SRSI can use any of
the stopping tests efficiently and can adaptively obtain the
EMAX and EMIN estimates if MAXADP [= IPARM(6)] and MINADP [= IPARM(7)]
are set to one, respectively. Hence, stopping tests (1) through (3)
are good choices. Also, the inner product $\la r,\tilde{z} \ra$
is already computed for these accelerators, making stopping test
(1) very cheap to compute. If the user has estimates of EMAX or
EMIN and wishes that these estimates be used in the stopping test,
then RPARM(2) or RPARM(3) should be set to these estimates
before calling NSPCG, and MAXADP or MINADP should be set to zero.
The SOR routine uses a specialized stopping test:
\begin{eqnarray*}
\frac{1}{1-{\rm SPECR}} \lb \frac
{\la u^{(n)}-u^{(n-1)},u^{(n)}-u^{(n-1)}\ra }
{\la u^{(n)},u^{(n)}\ra } \rb
^\ha & < & \zeta
\end{eqnarray*}
\noindent
where SPECR is an estimate of the spectral radius of the SOR matrix.
SPECR is adaptively computed by the SOR accelerator.
This stopping test and stopping test (6) are the only stopping tests
available for the SOR algorithm. Also, the stopping criterion for
SOR is initially set to a large value so that at least one Gauss-Seidel
iteration is performed.
\bigskip
\noindent
{\bf Tests for the Nonsymmetric Accelerators}
\bigskip
\indent
The safest and most economical stopping test for the nonsymmetric
accelerators
is stopping test (10), based on the norm of the left-preconditioned residual.
This calculation can usually be done cheaply. For most accelerators it
requires at most one extra dot product per iteration, and in some cases
(e.g. IOM, GMRES) the quantity is a by-product of the iterative algorithm and
is available at no extra cost.
In some cases it is desirable to use a stopping criterion
which bounds the relative error of the solution to the original problem.
Stopping test (3) is designed to do this. It requires that the
accelerator be able to estimate the extremal eigenvalues of the iteration
matrix; this is presently available for the nonsymmetric accelerators CGNR,
OMIN and CGCR. In the case of OMIN, the variable NS3 is used to denote
the size of the largest Hessenberg matrix to be used to estimate
the eigenvalues; a typical useful value is NS3$=40$.
However, if IQLR = 2 or 3 and a stopping test involving the
right-preconditioned residual is employed (e.g. NTEST=3), the accelerator may
have to do significant extra work to perform the stopping test. In this
case the accelerators OMIN and LANMIN, for instance, require only one inner
product per iteration. However, GMRES takes significant extra work per
iteration if a right preconditioning matrix is present.
\newpage
\section{Using Reduced System Methods}
\label{rb}
\indent
If matrix $A$ has either point or line property A, it can be
permuted to a red-black system:
\[
\lp \begin{array}{cc}
D_R & H \\
K & D_B \end{array} \rp
\lp \begin{array}{c}
u_R \\
u_B \end{array} \rp =
\lp \begin{array}{c}
b_R \\
b_B \end{array} \rp
\]
where $D_R$ and $D_B$ are scalar (block) diagonal matrices if a
point (line) red-black ordering was used. The reduced system
corresponding to this matrix is:
\[
(D_R - H D_B^{-1} K) u_R = b_R - H D_B^{-1} b_B
\]
NSPCG allows two means of using reduced system iterative methods.
\begin{enumerate}
\item
The user can apply the RS preconditioners. With these
preconditioners, the reduced system is not explicitly
computed, and the matrix-vector products using the reduced
system matrix are implicitly calculated using the COEF
matrix. The user must first fill the integer vector P
with a coloring vector for either point or line
red-black ordering. For point red-black ordering, the
REDBLK subroutine can be used to generate the coloring
vector:
\begin{verbatim}
CALL REDBLK (NDIM,N,MAXNZ,COEF,JCOEF,P,IP,NSTORE,IWKSP,IER)
\end{verbatim}
The variable NSTORE takes on the same values as the IPARM
parameter, namely
\begin{tabular}{rl}
NSTORE = $1$ & for primary format \\
= $2$ & for symmetric diagonal format \\
= $3$ & for nonsymmetric diagonal format \\
= $4$ & for symmetric coordinate format \\
= $5$ & for nonsymmetric coordinate format
\end{tabular}
P, IP, and IWKSP are integer workspace vectors of length
N upon input. Upon output, P contains the coloring
vector for red-black ordering if such an ordering exists.
IER on output will be either $0$, in which case a
red-black coloring vector was successfully found, or $-8$,
in which case matrix $A$ does not have point Property A.
For line red-black ordering, the user can use the COLOR
subroutine described in Section~\ref{mc} for generating the
coloring vector P. COLOR can also be used to generate
the coloring vector for point red-black ordering.
\item
If matrix $A$ has point Property A, the user can explicitly
compute the reduced system matrix and use any of the
preconditioners in NSPCG to solve the reduced system.
Storage demands are greater with this method since the
reduced system matrix must be stored. To invoke this
method, the user simply calls the subroutine RSNSP with
the same argument list as NSPCG. The vector P must contain
a coloring vector for point red-black ordering and IPERM
must be set to $1$. The interpretation of the parameters
in the argument list is otherwise the same as for NSPCG.
Note that since P and IP are used to permute the original
system to red-black form, the user cannot apply a further
permutation to the reduced system matrix.
\end{enumerate}
\newpage
\section{Using Multicolor Orderings}
\label{mc}
\indent
For many iterative methods, the unknowns are updated in an order
other than the natural (or lexicographical) ordering. The order
in which the unknowns are updated can be specified by imposing a
coloring on the mesh so that all the unknowns corresponding to nodes
of a particular color are updated before the unknowns corresponding
to nodes having a different color. Typically, a coloring is chosen
so that unknowns (nodes) of the same color are decoupled, thus
allowing these unknowns to be updated simultaneously. This
property makes multicolor iterative methods especially attractive
on vector or parallel computers.
NSPCG allows the matrix $A$ to be reordered and permuted
according to a coloring. The user can request that the matrix $A$
be permuted before iterating by filling vector P in the NSPCG calling
sequence with a coloring vector and setting IPERM [= IPARM(14)] to
one. A coloring is defined as the assignment to each equation of
an integer signifying the number of the color. If the mesh is to be
colored with $p$ colors, vector P contains for each equation
an integer between $1$ and $p$, inclusive. NSPCG will number those
equations labeled with $1$ first, followed by those equations labeled
with $2$, and so on. Equations labeled with $p$ will be numbered
last. NSPCG then permutes the matrix according to the new ordering
of the equations before the iteration begins. After iteration has
terminated, the matrix is permuted to its original form.
If a matrix whose mesh is colored with $p$ colors is permuted
according to the colors, the resulting matrix can be viewed as a
$p$ by $p$ block matrix:
\[ \lp \begin{array}{cccc}
A_{11} & A_{12} & \cdots & A_{1p} \\
A_{21} & A_{22} & \cdots & A_{2p} \\
\vdots & \vdots & \ddots & \vdots \\
A_{p1} & A_{p2} & \cdots & A_{pp}
\end{array} \rp \]
There are some restrictions on the use of permutations:
\begin{itemize}
\item If a permutation is requested with the primary storage
format, the allowable preconditioners are SOR6, SSOR6,
IC6, MIC6, and RS6. In this case, the coloring vector P
must be chosen so that the diagonal block submatrices
$A_{ii}$ are diagonal. Thus, equations having the same
color cannot be dependent. This requirement is made so
that vectorization of these methods is possible. Otherwise,
complicated block solves would be necessary for each
diagonal block instead of a simple vector operation,
and the primary storage format is not informative enough
to vectorize such operations.
\item If a permutation is requested with a diagonal storage
format, the allowable preconditioners are SOR7, SSOR7,
BIC7, BICX7, MBIC7, MBICX7, and RS7. In this case,
the coloring vector P can be chosen so that the $A_{ii}$
submatrices are banded. If they are banded, their
bands must be dense.
\item If a permutation is requested with a diagonal storage
format, extra diagonals are often created. NSPCG will
abort unless the column dimensions of the COEF and JCOEF
arrays are set large enough to store the permuted matrix
by diagonals.
\item If a permutation is requested with a coordinate storage
format, the allowable preconditioners are RICH5, JAC5,
NEU5, and LSP5. There are no restrictions for permutations
used with coordinate storage format, but permutations
are not effective with these preconditioners.
\end{itemize}
When using the NSPCG package, the differences between ``block'' and
``multicolor'' methods are as follows:
\begin{enumerate}
\item The name of a block preconditioner must end with a $2$ or
a $3$, while the name of a multicolor preconditioner must
end with a $6$ or a $7$.
\item A multicolor preconditioner can only be applied after a
permutation of the matrix (i.e., vector P must be set to a
coloring vector and IPERM [= IPARM(14)] must be set to one).
A block preconditioner can only be applied to a matrix
which is not permuted by NSPCG.
\item With a block method, the diagonal block matrices
$A_{ii}$ must be of equal size, which the user must
specify with KBLSZ [= IPARM(19)]. With a multicolor
method, the $A_{ii}$ are not required to be of equal
size.
\end{enumerate}
As an example of a coloring, suppose the user requests that a $3$
by $3$ mesh with the natural ordering be given a red-black ordering:
\[
\begin{array}{cc}
\begin{array}{ccc}
7 & 8 & 9 \\
4 & 5 & 6 \\
1 & 2 & 3
\end{array} &
\begin{tabular}{ccc}
R & B & R \\
B & R & B \\
R & B & R
\end{tabular} \\
& \\
\mbox{Natural} & \mbox{Red-Black}
\end{array}
\]
If the red unknowns are to be labeled first, then P would be
chosen as
\[
P = ( 1, 2, 1, 2, 1, 2, 1, 2, 1)
\]
resulting in the ordering:
\[
\begin{array}{ccc}
4 & 9 & 5 \\
7 & 3 & 8 \\
1 & 6 & 2
\end{array}
\]
If the user wished the black unknowns to be labeled first, P would
be chosen as
\[
P = ( 2, 1, 2, 1, 2, 1, 2, 1, 2)
\]
resulting in the ordering:
\[
\begin{array}{ccc}
8 & 4 & 9 \\
2 & 7 & 3 \\
5 & 1 & 6
\end{array}
\]
If a preconditioner requires a permutation then the user must set
IPERM [= IPARM(14)] equal to one and fill P with a coloring vector.
There are two facilities in NSPCG for automatically generating coloring
vectors --- REDBLK and COLOR.
\begin{enumerate}
\item
REDBLK determines whether or not matrix $A$ has Property A.
If it does, REDBLK generates a coloring vector for point
red-black ordering. The calling sequence for REDBLK is
\begin{verbatim}
CALL REDBLK (NDIM,N,MAXNZ,COEF,JCOEF,P,IP,NSTORE,IWKSP,IER)
\end{verbatim}
The parameters NDIM, MAXNZ, COEF, JCOEF, N, and NSTORE
have been explained previously. P, IP, and IWKSP are integer
workspace vectors of length N on input. On output, P
contains the coloring vector if IER $=0$. If IER $=-8$,
the matrix does not have Property A, so a red-black
coloring vector does not exist and the user must choose
some other iterative method.
\item
COLOR replicates a rectangular (2-D) or box (3-D) color
pattern throughout a rectangular or box domain. For
example, a red-black pattern on a 2-D rectangular grid
is a replication of the pattern:
\[ \begin{tabular}{cc}
B & R \\
R & B
\end{tabular} \]
The calling sequence for COLOR is
\begin{verbatim}
CALL COLOR (NXP,NYP,NZP,NX,NY,NZ,PATT,P)
\end{verbatim}
NXP, NYP, and NZP are the $x,y,$ and $z$ dimensions of the
pattern, and NX, NY, and NZ are the dimensions of the grid.
For 2-D problems, NZP and NZ are both set to $1$. PATT is an
integer vector of length NXP*NYP*NZP containing the pattern
in the order left-to-right, bottom-to-top. Different colors
are coded as different integers in PATT, starting with one.
P is an integer vector of length NX*NY*NZ which contains
the coloring vector on output. For example, the color
pattern
\[ \begin{tabular}{cc}
B & R \\
R & B
\end{tabular} \]
would be specified with NXP $=2$, NYP $=2$, NZP $=1$, and
PATT $=(1,2,2,1)$. A red-black pattern in three dimensions
whose pattern is given by
\[ \begin{array}{cc}
\begin{tabular}{cc}
B & R \\
R & B
\end{tabular} &
\begin{tabular}{cc}
R & B \\
B & R
\end{tabular} \\
& \\
k=1 & k=2
\end{array} \]
would be specified with NXP $=2$, NYP $=2$, NZP $=2$, and
PATT $=(1,2,2,1,2,1,1,2)$. A line red-black pattern in two
dimensions with the lines of the same color running
horizontally as in the pattern
\[ \begin{tabular}{c}
B \\
R
\end{tabular} \]
would be specified with NXP $=1$, NYP $=2$, NZP $=1$, and
PATT $=(1,2)$. The 4-color pattern in two dimensions
\[ \begin{tabular}{cc}
B & Y \\
R & G
\end{tabular} \]
would be specified with NXP $=2$, NYP $=2$, NZP $=1$, and
PATT $=(1,2,3,4)$. A 4-color pattern is often applied
to matrices resulting from a 9-point finite difference stencil.
\end{enumerate}
There are many interesting colorings which cannot be generated
by either REDBLK or COLOR since they do not represent replications
of a rectangular pattern. One such coloring is ordering by
diagonals along a rectangular mesh:
\[ \begin{array}{ccc}
6 & 8 & 9 \\
3 & 5 & 7 \\
1 & 2 & 4
\end{array} \]
Such orderings are useful when the stencil is a 5-point star,
because they allow vectorization along diagonals for such methods
as SOR and IC(0). The coloring vector for such an ordering is
\[ \begin{array}{ccc}
3 & 4 & 5 \\
2 & 3 & 4 \\
1 & 2 & 3
\end{array} \]
or
\[ P = (1,2,3,2,3,4,3,4,5) \]
\newpage
\section{Error Conditions}
\label{ier}
\indent
The NSPCG package assigns to an integer variable IER certain
values to indicate error conditions (or lack of them) in the
program's execution. Positive values are assigned to warnings, and
negative values are assigned to fatal errors.
The values IER can be assigned and the corresponding meanings
are indicated in the following table.
\begin{table}[h]
\begin{center}
\begin{tabular}{|c|p{3.5in}|} \hline
IER & Meaning \\ \hline
$0$ & No error detected \\ \hline
$-1$ & Nonpositive matrix size N \\
$-2$ & Insufficient real workspace \\
$-3$ & Insufficient integer workspace \\
$-4$ & Nonpositive diagonal element \\
$-5$ & Nonexistent diagonal element \\
$-6$ & $A$ is not positive definite \\
$-7$ & $Q$ is not positive definite \\
$-8$ & Cannot permute matrix as requested \\
$-9$ & MDIM is not large enough to allow expansion
of matrix \\
$-10$ & Inadmissible parameter encountered \\
$-11$ & Incorrect storage mode for block method \\
$-12$ & Zero pivot encountered in factorization \\
$-13$ & Breakdown in direction vector calculation \\
$-14$ & Breakdown in attempt to perform rotation \\
$-15$ & Breakdown in iterate calculation \\
$-16$ & Unimplemented combination of parameters \\
$-18$ & Unable to perform eigenvalue estimation \\ \hline
$1$ & Failure to converge in ITMAX iterations \\
$2$ & ZETA too small -- reset to $500*$SRELPR \\
$3$ & ZBRENT failed to converge in MAXFN iterations
(signifies difficulty in eigenvalue
estimation) \\
$4$ & In ZBRENT, $f(a)$ and $f(b)$ have the same sign
(signifies difficulty in eigenvalue
estimation) \\
$5$ & Negative pivot encountered in factorization \\
\hline
\end{tabular}
\caption{Meanings of Assigned IER Values}
\end{center}
\end{table}
\newpage
\section{Notes on Use}
\label{notes}
\indent
If selected, scaling and unscaling the matrix may introduce
small changes in the coefficient matrix and the right hand side
vector RHS due to round-off errors in the computer arithmetic.
Also, many preconditioners will attempt to rearrange COEF and
JCOEF to enhance vectorization. Thus, the COEF and JCOEF arrays
on output will be equivalent to but will not, in general,
be identical to the input arrays.
For the Successive Overrelaxation (SOR) method, it is sometimes
possible to find red-black or multicolor orderings of the equations
for which the method has the same rate of convergence as for
the natural ordering. Thus,
a red-black, line red-black, or multicolor ordering may be
preferable to the natural ordering since greater vectorizability
is possible. The SSOR, IC, MIC, and block factorization methods,
however, have better convergence rates with natural ordering than
with multicolor orderings. Since these methods vectorize better
with a multicolor ordering, there is a tradeoff between fewer
expensive iterations and a greater number of cheap iterations.
Thus, there
might be ranges of problem sizes when one approach may be better
than the other. This comment also applies when comparing two
different preconditioners with different convergence rates such
as Jacobi and MIC. For small N, Jacobi may be preferable to the
more sophisticated MIC method because of its vectorizability and
the relatively small difference in the number of iterations. For
larger N, the difference in number of iterations will outweigh
the difference in time per iteration.
\newpage
\section{Use of Subroutine VFILL}
\label{vfill}
\indent
The vector U should contain an initial approximation to the
solution of the linear system before NSPCG is called. If the user
has no information for making such a guess, then the zero vector
may be used as the starting vector. The subroutine VFILL can be
used to fill a vector with a constant. For example,
\begin{verbatim}
CALL VFILL (N,U,VAL)
\end{verbatim}
fills the first N locations of the vector U with value VAL.
\newpage
\section{Sample Usage}
\label{sample}
\indent
In this section, several examples using NSPCG to solve a sample
matrix problem are given.
\bigskip
\noindent
{\bf Example 1:}
\bigskip
\indent
In this example, NSPCG was used to solve the linear system $Ax=b$
which resulted from discretizing the problem
\[ \lc \begin{array}{cc}
u_{xx} + 2u_{yy} = 0 & \mbox{on S $= [0,1]\times[0,1]$} \\
u = 1 + xy & \mbox{on boundary of S}
\end{array} \right. \]
using the standard 5-point central finite difference formula with
a mesh size of $h = \frac{1}{11}$. The finite difference stencil
at node $(i,j)$ is thus
\[ 6u_{i,j}-u_{i-1,j}-u_{i+1,j}-2u_{i,j+1}-2u_{i,j-1} = 0 \]
This resulted in a symmetric and positive definite matrix of order
$100$ with five nonzero diagonals. Symmetric diagonal storage was
used to represent the matrix. Thus, only the main diagonal and the
two nonzero super-diagonals were stored so MAXNZ $=3$. Also, the
JCOEF vector was $(0,1,10)$. The iterative method used was
the Modified Incomplete Cholesky (MIC) method with conjugate gradient
acceleration. The computer used to run the program was a Cyber 170/750
and the compiler used was FTN5. The program to generate the matrix
data and the output resulting from this call to NSPCG is given below:
\begin{verbatim}
PROGRAM MAIN (OUTPUT,TAPE6=OUTPUT)
C
C ... ARRAY DECLARATIONS.
C
REAL COEF(120,4), RHS(100), U(100), WKSP(600), UBAR(1),
A RPARM(30)
INTEGER JCOEF(4), IWKSP(300), IPARM(30), P(1), IP(1)
EXTERNAL CG, MIC2
C
NDIM = 120
MDIM = 4
NW = 600
INW = 300
C
C ... GENERATE COEF, JCOEF, AND RHS.
C
NX = 10
NY = 10
N = NX*NY
H = 1.0/FLOAT(NX + 1)
MAXNZ = 3
DO 10 I = 1,N
COEF(I,1) = 6.0
COEF(I,2) = -1.0
COEF(I,3) = -2.0
RHS(I) = 0.0
10 CONTINUE
K = 0
DO 30 J = 1,NY
Y = FLOAT(J)*H
DO 25 I = 1,NX
X = FLOAT(I)*H
K = K + 1
IF (J .EQ. 1) THEN
RHS(K) = RHS(K) + 2.0
ENDIF
IF (J .EQ. NY) THEN
RHS(K) = RHS(K) + 2.0*(1.0 + X)
COEF(K,3) = 0.0
ENDIF
IF (I .EQ. 1) THEN
RHS(K) = RHS(K) + 1.0
ENDIF
IF (I .EQ. NX) THEN
RHS(K) = RHS(K) + 1.0 + Y
COEF(K,2) = 0.0
ENDIF
25 CONTINUE
30 CONTINUE
JCOEF(1) = 0
JCOEF(2) = 1
JCOEF(3) = NX
CALL DFAULT (IPARM,RPARM)
C
C ... NOW, RESET SOME DEFAULT VALUES.
C
IPARM(2) = 50
IPARM(3) = 3
RPARM(1) = 1.0E-8
C
C ... GENERATE AN INITIAL GUESS FOR U AND CALL NSPCG.
C
CALL VFILL (N,U,0.0)
C
CALL NSPCG (MIC2,CG,NDIM,MDIM,N,MAXNZ,COEF,JCOEF,P,IP,
A U,UBAR,RHS,WKSP,IWKSP,NW,INW,IPARM,RPARM,IER)
STOP
END
\end{verbatim}
\newpage
\begin{verbatim}
INITIAL ITERATIVE PARAMETERS
PREPROCESSOR AND PRECONDITIONER PARAMETERS
IPARM(12) = 2 (NSTORE)
IPARM(13) = 0 (ISCALE)
IPARM(14) = 0 (IPERM )
IPARM(15) = 1 (IFACT )
IPARM(16) = 0 (LVFILL)
IPARM(17) = 0 (LTRUNC)
IPARM(18) = 2 (IPROPA)
IPARM(19) = -1 (KBLSZ )
IPARM(20) = -1 (NBL2D )
IPARM(21) = 1 (IFCTV )
IPARM(22) = 1 (IQLR )
IPARM(23) = 2 (ISYMM )
IPARM(24) = 0 (IELIM )
IPARM(25) = 1 (NDEG )
RPARM(13) = .00000000E+00 (TIMFAC)
RPARM(14) = .00000000E+00 (TIMTOT)
RPARM(15) = .35500000E-11 (TOL )
RPARM(16) = .00000000E+00 (AINF )
INITIAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 50 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-07 (ZETA )
RPARM( 2) = .20000000E+01 (EMAX )
RPARM( 3) = .10000000E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .00000000E+00 (TIMIT )
RPARM( 7) = .00000000E+00 (DIGIT1)
RPARM( 8) = .00000000E+00 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
\end{verbatim}
\newpage
\begin{verbatim}
CG
INTERMEDIATE OUTPUT AFTER EACH ITERATION
ITERATION CONVERGENCE EMAX EMIN
N S TEST
0 0 .99366E+01 .20000E+01 .10000E+01
1 1 .46168E-01 .10010E+01 .10010E+01
2 2 .57189E-02 .20232E+01 .10002E+01
3 3 .12255E-02 .24807E+01 .10001E+01
4 4 .23770E-03 .27522E+01 .10000E+01
5 5 .49325E-04 .28711E+01 .10000E+01
6 6 .87776E-05 .29024E+01 .10000E+01
7 7 .16811E-05 .29071E+01 .10000E+01
8 8 .42316E-06 .29074E+01 .10000E+01
9 9 .15339E-06 .29075E+01 .10000E+01
10 10 .38502E-07 .29075E+01 .10000E+01
11 11 .71532E-08 .29076E+01 .10000E+01
CG HAS CONVERGED IN 11 ITERATIONS
\end{verbatim}
\newpage
\begin{verbatim}
FINAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 11 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-07 (ZETA )
RPARM( 2) = .29076287E+01 (EMAX )
RPARM( 3) = .10000004E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .34800000E+00 (TIMIT )
RPARM( 7) = .81454998E+01 (DIGIT1)
RPARM( 8) = .78457903E+01 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
FINAL ITERATIVE PARAMETERS
PREPROCESSOR AND PRECONDITIONER PARAMETERS
IPARM(12) = 2 (NSTORE)
IPARM(13) = 0 (ISCALE)
IPARM(14) = 0 (IPERM )
IPARM(15) = 1 (IFACT )
IPARM(16) = 0 (LVFILL)
IPARM(17) = 0 (LTRUNC)
IPARM(18) = 1 (IPROPA)
IPARM(19) = -1 (KBLSZ )
IPARM(20) = -1 (NBL2D )
IPARM(21) = 1 (IFCTV )
IPARM(22) = 1 (IQLR )
IPARM(23) = 2 (ISYMM )
IPARM(24) = 0 (IELIM )
IPARM(25) = 1 (NDEG )
RPARM(13) = .22000000E-01 (TIMFAC)
RPARM(14) = .51200000E+00 (TIMTOT)
RPARM(15) = .35500000E-11 (TOL )
RPARM(16) = .00000000E+00 (AINF )
\end{verbatim}
\newpage
\noindent
{\bf Example 2:}
\bigskip
\indent
In this example, the same problem was solved but primary storage
format was used. Thus all five nonzero diagonals were stored.
The iterative method used was the Reduced System (RS) method with
conjugate gradient acceleration. To use this method, a red-black
coloring was applied to the mesh with the REDBLK facility. Since
NSPCG must permute the matrix, the P and IP vectors had to be
dimensioned to the problem size. The program to generate the matrix
data and the output resulting from this call to NSPCG is given below:
\begin{verbatim}
PROGRAM MAIN (OUTPUT,TAPE6=OUTPUT)
C
C ... ARRAY DECLARATIONS.
C
REAL COEF(120,5), RHS(100), U(100), WKSP(600), UBAR(1),
A RPARM(30)
INTEGER JCOEF(120,5), IWKSP(300), IPARM(30), P(100), IP(100)
EXTERNAL CG, RS6
C
NDIM = 120
MDIM = 5
NW = 600
INW = 300
C
C ... GENERATE COEF, JCOEF, AND RHS.
C
NX = 10
NY = 10
N = NX*NY
H = 1.0/FLOAT(NX + 1)
MAXNZ = 5
DO 10 I = 1,N
COEF(I,1) = 6.0
COEF(I,2) = -1.0
COEF(I,3) = -2.0
COEF(I,4) = -1.0
COEF(I,5) = -2.0
JCOEF(I,1) = I
JCOEF(I,2) = I + 1
JCOEF(I,3) = I + NX
JCOEF(I,4) = I - 1
JCOEF(I,5) = I - NX
RHS(I) = 0.0
10 CONTINUE
K = 0
DO 40 J = 1,NY
Y = FLOAT(J)*H
DO 35 I = 1,NX
X = FLOAT(I)*H
K = K + 1
IF (J .EQ. 1) THEN
RHS(K) = RHS(K) + 2.0
COEF(K,5) = 0.0
JCOEF(K,5) = 0
ENDIF
IF (J .EQ. NY) THEN
RHS(K) = RHS(K) + 2.0*(1.0 + X)
COEF(K,3) = 0.0
JCOEF(K,3) = 0
ENDIF
IF (I .EQ. 1) THEN
RHS(K) = RHS(K) + 1.0
COEF(K,4) = 0.0
JCOEF(K,4) = 0
ENDIF
IF (I .EQ. NX) THEN
RHS(K) = RHS(K) + 1.0 + Y
COEF(K,2) = 0.0
JCOEF(K,2) = 0
ENDIF
35 CONTINUE
40 CONTINUE
CALL DFAULT (IPARM,RPARM)
C
C ... NOW, RESET SOME DEFAULT VALUES.
C
IPARM(3) = 3
IPARM(12) = 1
IPARM(14) = 1
IPARM(23) = 0
C
C ... GENERATE AN INITIAL GUESS FOR U AND CALL NSPCG.
C
CALL VFILL (N,U,0.0)
C
CALL REDBLK (NDIM,N,MAXNZ,COEF,JCOEF,P,IP,1,IWKSP,IER)
CALL NSPCG (RS6,CG,NDIM,MDIM,N,MAXNZ,COEF,JCOEF,P,IP,
A U,UBAR,RHS,WKSP,IWKSP,NW,INW,IPARM,RPARM,IER)
STOP
END
\end{verbatim}
\newpage
\begin{verbatim}
INITIAL ITERATIVE PARAMETERS
PREPROCESSOR AND PRECONDITIONER PARAMETERS
IPARM(12) = 1 (NSTORE)
IPARM(13) = 0 (ISCALE)
IPARM(14) = 1 (IPERM )
IPARM(15) = 1 (IFACT )
IPARM(16) = 0 (LVFILL)
IPARM(17) = 0 (LTRUNC)
IPARM(18) = 2 (IPROPA)
IPARM(19) = -1 (KBLSZ )
IPARM(20) = -1 (NBL2D )
IPARM(21) = 1 (IFCTV )
IPARM(22) = 1 (IQLR )
IPARM(23) = 0 (ISYMM )
IPARM(24) = 0 (IELIM )
IPARM(25) = 1 (NDEG )
RPARM(13) = .00000000E+00 (TIMFAC)
RPARM(14) = .00000000E+00 (TIMTOT)
RPARM(15) = .35500000E-11 (TOL )
RPARM(16) = .00000000E+00 (AINF )
INITIAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 100 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-05 (ZETA )
RPARM( 2) = .20000000E+01 (EMAX )
RPARM( 3) = .10000000E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .00000000E+00 (TIMIT )
RPARM( 7) = .00000000E+00 (DIGIT1)
RPARM( 8) = .00000000E+00 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
\end{verbatim}
\newpage
\begin{verbatim}
CG
INTERMEDIATE OUTPUT AFTER EACH ITERATION
ITERATION CONVERGENCE EMAX EMIN
N S TEST
0 0 .58026E+01 .20000E+01 .10000E+01
1 1 .43389E+00 .63392E+00 .63392E+00
2 2 .39789E+00 .86043E+00 .29993E+00
3 3 .52881E+00 .92756E+00 .15453E+00
4 4 .36826E+00 .97104E+00 .98805E-01
5 5 .20966E+00 .98764E+00 .83754E-01
6 6 .80236E-01 .99236E+00 .80254E-01
7 7 .26052E-01 .99496E+00 .79652E-01
8 8 .16576E-01 .99605E+00 .79530E-01
9 9 .70079E-02 .99698E+00 .79406E-01
10 10 .23601E-02 .99731E+00 .79383E-01
11 11 .12251E-02 .99754E+00 .79377E-01
12 12 .41032E-03 .99777E+00 .79374E-01
13 13 .14115E-03 .99785E+00 .79374E-01
14 14 .44531E-04 .99795E+00 .79374E-01
15 15 .13200E-04 .99802E+00 .79374E-01
16 16 .51949E-05 .99804E+00 .79374E-01
17 17 .12208E-05 .99806E+00 .79374E-01
18 18 .24719E-06 .99807E+00 .79374E-01
CG HAS CONVERGED IN 18 ITERATIONS
\end{verbatim}
\newpage
\begin{verbatim}
FINAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 18 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-05 (ZETA )
RPARM( 2) = .99806941E+00 (EMAX )
RPARM( 3) = .79373655E-01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .31900000E+00 (TIMIT )
RPARM( 7) = .66069656E+01 (DIGIT1)
RPARM( 8) = .71292805E+01 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
FINAL ITERATIVE PARAMETERS
PREPROCESSOR AND PRECONDITIONER PARAMETERS
IPARM(12) = 1 (NSTORE)
IPARM(13) = 0 (ISCALE)
IPARM(14) = 1 (IPERM )
IPARM(15) = 1 (IFACT )
IPARM(16) = 0 (LVFILL)
IPARM(17) = 0 (LTRUNC)
IPARM(18) = 2 (IPROPA)
IPARM(19) = -1 (KBLSZ )
IPARM(20) = -1 (NBL2D )
IPARM(21) = 1 (IFCTV )
IPARM(22) = 1 (IQLR )
IPARM(23) = 0 (ISYMM )
IPARM(24) = 0 (IELIM )
IPARM(25) = 1 (NDEG )
RPARM(13) = .00000000E+00 (TIMFAC)
RPARM(14) = .53800000E+00 (TIMTOT)
RPARM(15) = .35500000E-11 (TOL )
RPARM(16) = .00000000E+00 (AINF )
\end{verbatim}
\newpage
\noindent
{\bf Example 3:}
\bigskip
\indent
In this example, the same problem was solved using the Line SOR
method with line red-black ordering. To use this method,
a line red-black coloring was applied to the mesh with the COLOR
facility. Since NSPCG must permute the matrix, the P and IP
vectors had to be dimensioned to the problem size. The matrix
was stored in the symmetric diagonal storage format. Note that
even though only three nonzero diagonals were stored, it was
necessary to dimension the COEF and JCOEF arrays to be large
enough to store the permuted matrix. The program to generate the
matrix data and the output resulting from this call to NSPCG is
given below:
\begin{verbatim}
PROGRAM MAIN (OUTPUT,TAPE6=OUTPUT)
C
C ... ARRAY DECLARATIONS.
C
REAL COEF(120,5), RHS(100), U(100), WKSP(600), UBAR(1),
A RPARM(30)
INTEGER JCOEF(5), IWKSP(300), IPARM(30), P(100), IP(100)
INTEGER PATT(2)
EXTERNAL SOR, SOR7
C
NDIM = 120
MDIM = 5
NW = 600
INW = 300
C
C ... GENERATE COEF, JCOEF, AND RHS.
C
NX = 10
NY = 10
NZ = 1
N = NX*NY
H = 1.0/FLOAT(NX + 1)
MAXNZ = 3
DO 10 I = 1,N
COEF(I,1) = 6.0
COEF(I,2) = -1.0
COEF(I,3) = -2.0
RHS(I) = 0.0
10 CONTINUE
K = 0
DO 30 J = 1,NY
Y = FLOAT(J)*H
DO 25 I = 1,NX
X = FLOAT(I)*H
K = K + 1
IF (J .EQ. 1) THEN
RHS(K) = RHS(K) + 2.0
ENDIF
IF (J .EQ. NY) THEN
RHS(K) = RHS(K) + 2.0*(1.0 + X)
COEF(K,3) = 0.0
ENDIF
IF (I .EQ. 1) THEN
RHS(K) = RHS(K) + 1.0
ENDIF
IF (I .EQ. NX) THEN
RHS(K) = RHS(K) + 1.0 + Y
COEF(K,2) = 0.0
ENDIF
25 CONTINUE
30 CONTINUE
JCOEF(1) = 0
JCOEF(2) = 1
JCOEF(3) = NX
CALL DFAULT (IPARM,RPARM)
C
C ... NOW, RESET SOME DEFAULT VALUES.
C
IPARM(3) = 3
IPARM(14) = 1
C
C ... GENERATE AN INITIAL GUESS FOR U AND CALL NSPCG.
C
CALL VFILL (N,U,0.0)
C
NXP = 1
NYP = 2
NZP = 1
PATT(1) = 1
PATT(2) = 2
CALL COLOR (NXP,NYP,NZP,NX,NY,NZ,PATT,P)
CALL NSPCG (SOR7,SOR,NDIM,MDIM,N,MAXNZ,COEF,JCOEF,P,IP,
A U,UBAR,RHS,WKSP,IWKSP,NW,INW,IPARM,RPARM,IER)
STOP
END
\end{verbatim}
\newpage
\begin{verbatim}
INITIAL ITERATIVE PARAMETERS
PREPROCESSOR AND PRECONDITIONER PARAMETERS
IPARM(12) = 2 (NSTORE)
IPARM(13) = 0 (ISCALE)
IPARM(14) = 1 (IPERM )
IPARM(15) = 1 (IFACT )
IPARM(16) = 0 (LVFILL)
IPARM(17) = 0 (LTRUNC)
IPARM(18) = 2 (IPROPA)
IPARM(19) = -1 (KBLSZ )
IPARM(20) = -1 (NBL2D )
IPARM(21) = 1 (IFCTV )
IPARM(22) = 1 (IQLR )
IPARM(23) = 2 (ISYMM )
IPARM(24) = 0 (IELIM )
IPARM(25) = 1 (NDEG )
RPARM(13) = .00000000E+00 (TIMFAC)
RPARM(14) = .00000000E+00 (TIMTOT)
RPARM(15) = .35500000E-11 (TOL )
RPARM(16) = .00000000E+00 (AINF )
INITIAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 100 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-05 (ZETA )
RPARM( 2) = .20000000E+01 (EMAX )
RPARM( 3) = .10000000E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .00000000E+00 (TIMIT )
RPARM( 7) = .00000000E+00 (DIGIT1)
RPARM( 8) = .00000000E+00 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
\end{verbatim}
\newpage
\begin{verbatim}
SOR
INTERMEDIATE OUTPUT AFTER EACH ITERATION
NUMBER OF CONVERGENCE EMAX OMEGA SPECTRAL
ITERATIONS TEST RADIUS
0 0 .10000E+04 .20000E+01 .10000E+01 .00000E+00
1 0 .10000E+04 .20000E+01 .10000E+01 .61609E+00
2 0 .60559E+00 .20000E+01 .10000E+01 .63475E+00
3 0 .65576E+00 .20000E+01 .10000E+01 .77061E+00
4 0 .67933E+00 .91212E+00 .10000E+01 .83197E+00
5 1 .67933E+00 .91212E+00 .14185E+01 .14752E+01
6 1 .67933E+00 .91212E+00 .14185E+01 .10876E+01
7 1 .67933E+00 .91212E+00 .14185E+01 .77707E+00
8 1 .36291E+00 .91212E+00 .14185E+01 .71832E+00
9 1 .23778E+00 .91212E+00 .14185E+01 .69934E+00
10 1 .10202E+00 .91212E+00 .14185E+01 .69195E+00
11 1 .69782E-01 .91212E+00 .14185E+01 .68947E+00
12 1 .47922E-01 .91212E+00 .14185E+01 .68862E+00
13 1 .32945E-01 .91212E+00 .14185E+01 .68826E+00
14 1 .22660E-01 .91212E+00 .14185E+01 .68813E+00
15 1 .14844E-01 .94045E+00 .14185E+01 .68808E+00
16 2 .14844E-01 .94045E+00 .14926E+01 .74939E+00
17 2 .14844E-01 .94045E+00 .14926E+01 .74302E+00
18 2 .14844E-01 .94045E+00 .14926E+01 .65901E+00
19 2 .27382E-02 .94045E+00 .14926E+01 .61710E+00
20 2 .15091E-02 .94045E+00 .14926E+01 .59201E+00
21 2 .83411E-03 .94045E+00 .14926E+01 .57533E+00
22 2 .45715E-03 .94045E+00 .14926E+01 .56343E+00
23 2 .24843E-03 .94045E+00 .14926E+01 .55452E+00
24 2 .13395E-03 .94045E+00 .14926E+01 .54759E+00
25 2 .71687E-04 .94045E+00 .14926E+01 .54206E+00
26 2 .38156E-04 .94045E+00 .14926E+01 .53753E+00
27 2 .20202E-04 .94045E+00 .14926E+01 .53376E+00
28 2 .10645E-04 .94045E+00 .14926E+01 .53057E+00
29 2 .55865E-05 .94045E+00 .14926E+01 .52784E+00
30 2 .29208E-05 .94045E+00 .14926E+01 .52547E+00
31 2 .15221E-05 .94045E+00 .14926E+01 .52339E+00
32 2 .79083E-06 .94045E+00 .14926E+01 .52156E+00
SOR HAS CONVERGED IN 32 ITERATIONS
\end{verbatim}
\newpage
\begin{verbatim}
FINAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 32 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-05 (ZETA )
RPARM( 2) = .94045018E+00 (EMAX )
RPARM( 3) = .10000000E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .52100000E+00 (TIMIT )
RPARM( 7) = .61019185E+01 (DIGIT1)
RPARM( 8) = .63177121E+01 (DIGIT2)
RPARM( 9) = .14926136E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .52156493E+00 (SPECR )
FINAL ITERATIVE PARAMETERS
PREPROCESSOR AND PRECONDITIONER PARAMETERS
IPARM(12) = 2 (NSTORE)
IPARM(13) = 0 (ISCALE)
IPARM(14) = 1 (IPERM )
IPARM(15) = 1 (IFACT )
IPARM(16) = 0 (LVFILL)
IPARM(17) = 0 (LTRUNC)
IPARM(18) = 1 (IPROPA)
IPARM(19) = -1 (KBLSZ )
IPARM(20) = -1 (NBL2D )
IPARM(21) = 1 (IFCTV )
IPARM(22) = 1 (IQLR )
IPARM(23) = 2 (ISYMM )
IPARM(24) = 0 (IELIM )
IPARM(25) = 1 (NDEG )
RPARM(13) = .50000000E-02 (TIMFAC)
RPARM(14) = .75100000E+00 (TIMTOT)
RPARM(15) = .35500000E-11 (TOL )
RPARM(16) = .00000000E+00 (AINF )
\end{verbatim}
\newpage
\section{Using NSPCG in Matrix Format-Free Mode}
\label{mffm}
\indent
When the NSPCG subroutine is called, the user is required to use
one of the allowed data structures. However, it is possible to call
the acceleration routines directly in which case the user supplies
customized routines for performing certain matrix operations. In
this manner, an iterative algorithm can be designed which is suitable
to a particular application. Also, storage demands may preclude
copying the matrix into one of NSPCG's allowed matrix formats.
In this case, it may be necessary to use the data format available
from the application code itself when writing the subroutines for
matrix operations.
The subroutines which the user must supply to perform certain
matrix operations are denoted SUBA, SUBAT, SUBQL, SUBQR, SUBQLT,
SUBQRT, SUBQ, and SUBADP. Not all of these subroutines are needed
for every accelerator. The user should consult the calling sequences
for the accelerators to determine which subroutines need to be supplied.
SUBA and SUBAT are routines for performing $y=Ax$ and $y=A^Tx$,
respectively. Their calling sequences are
\begin{verbatim}
SUBA (COEF,JCOEF,WFAC,JWFAC,N,X,Y)
\end{verbatim}
and
\begin{verbatim}
SUBAT (COEF,JCOEF,WFAC,JWFAC,N,X,Y)
\end{verbatim}
where
\bigskip
\begin{list}{}{\labelwidth 0.70in
\leftmargin 1.00in \rightmargin 0.25in}
\item[COEF \hfill] is a real vector. [real, input]
\item[JCOEF \hfill] is an integer vector. [integer, input]
\item[WFAC \hfill] is a real vector. [real, input]
\item[JWFAC \hfill] is an integer vector. [integer, input]
\item[N \hfill] is the order of the linear system. [integer, input]
\item[X \hfill] is a real vector of length N containing the
input vector (the vector to be multiplied by) on input.
It should be unchanged on output. [real, input]
\item[Y \hfill] is a real vector of length N containing the
output vector (resultant product vector) on output.
[real, output]
\end{list}
\bigskip
\noindent
The vectors COEF, JCOEF, WFAC, and JWFAC are simply passed by
the acceleration routines to the matrix-vector product routines SUBA
and SUBAT. Hence, the user may use these vectors for any purpose
inside SUBA or SUBAT. For example, they can be used to represent
the operator $A$ or workspace needed to compute $Ax$ or $A^Tx$.
If any of these vectors is not needed, it is not necessary to
declare them.
SUBQL, SUBQR, SUBQLT, and SUBQRT are routines for performing
\mbox{$y=Q_L^{-1}x$}, \mbox{$y=Q_R^{-1}x$},
\mbox{$y=Q_L^{-T}x$}, and \mbox{$y=Q_R^{-T}x$},
respectively. These routines assume that the preconditioning
operator $Q$ can be written as $Q=Q_LQ_R$ and that the preconditioned
matrix is $Q_L^{-1}AQ_R^{-1}$. Note that $Q_R=I$ for left
preconditioning and $Q_L=I$ for right preconditioning. These
four subroutines have the same calling sequences as SUBA and
SUBAT. As with these two subroutines, the parameters COEF,
JCOEF, WFAC, and JWFAC can be used for any purpose by the user
since they are only passed along by the accelerator. Also,
X is the input vector (the right-hand-side vector in the
system $Q_Ly=x$) and Y is the output vector (the solution
to the system). The NSPCG library contains a routine called
COPY with the argument list given above which simply performs
a copy of the vector X into vector Y. Hence if the user wishes
to specify that $Q_R=I$ (i.e., only a left preconditioner is
desired), then COPY should be used for SUBQR and SUBQRT.
SUBQ is used only in the SOR routine and computes an SOR iteration.
Thus SUBQ computes
\[ u^{(n+1)} = Gu^{(n)} + k \]
where
\[ G = I - Q^{-1}A \]
and
\[ k = Q^{-1}b \]
Hence SUBQ computes
\[ u^{(n+1)} = \lp \oom D-C_L \rp^{-1}\lb \lp \oom-1 \rp Du^{(n)}
+ C_Uu^{(n)}+b \rb \]
where
\[ A = D-C_L-C_U \]
Its calling sequence is
\begin{verbatim}
SUBQ (COEF,JCOEF,WFAC,JWFAC,N,U,RHS,UNEW)
\end{verbatim}
The interpretation of the parameters COEF, JCOEF, WFAC, JWFAC,
and N is the same as before. U is a real vector of length N
which contains the current solution vector $u^{(n)}$ on input, and
it should not be changed on output. UNEW is a real vector of length
N which should contain on output the new solution vector $u^{(n+1)}$
after one SOR iteration. RHS is a real vector of length N containing
the right-hand-side $b$ of the linear system on input; it does not
need to be preserved on output and so can be used for workspace
if needed.
SUBADP is used only in the adaptive SRCG and SRSI routines
and is used to calculate certain quantities needed for the adaptive
procedures for $\om$. Its calling sequence is
\begin{verbatim}
SUBADP (COEF,JCOEF,WFAC,JWFAC,N,P,R,PDP,PLDUP)
\end{verbatim}
The interpretation of the parameters COEF, JCOEF, WFAC, JWFAC,
and N is the same as before. P is a real vector of length N which
contains on input a direction vector from the conjugate gradient
or Chebyshev algorithms. It should not be changed on output.
R is a real vector of length N from the accelerator routine which
can be used for workspace. PDP is a real constant which contains on
output the quantity $(p,Dp)$ where $A=D-C_L-C_U$. PLDUP is a real
constant which contains on output the quantity $(p,C_LD^{-1}C_Up)$.
Note that the user is free to define $D$ as the diagonal of $A$ or
as the block diagonal part of $A$, as long as the SUBQL, SUBQR,
SUBQLT, and SUBQRT routines are consistently defined.
If the user does not wish to adapt on $\om$, the nonadaptive CG and SI
routines should be called instead of the adaptive ones
since they do not require the user to supply SUBADP.
If the SOR, SRCG, or SRSI accelerators are called, the user
may need the value of $\om$ used in the accelerator for the
lower level matrix operation routines. The user should include the
following two lines of code in each subroutine requiring $\om$:
\begin{verbatim}
LOGICAL OMGADP
COMMON / ITCOM5 / OMGADP, OMEGA, ALPHAB, BETAB, FFF, SPECR
\end{verbatim}
The variable OMEGA contains the value of $\om$.
The calling sequences for the acceleration routines with the
corresponding $\la$~{\em accel}~$\ra$ entries
on the left are as follows:
\bigskip
\begin{list}{}{\labelwidth 0.70in
\leftmargin 1.00in \rightmargin 0.25in}
\item[CG \hfill]
{\tt CGW (SUBA,SUBQL, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[SI \hfill]
{\tt SIW (SUBA,SUBQL, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[SOR \hfill]
{\tt SORW (SUBA,SUBQ, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[SRCG \hfill]
{\tt SRCGW (SUBA,SUBQL,SUBADP, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[SRSI \hfill]
{\tt SRSIW (SUBA,SUBQL,SUBADP, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[BASIC \hfill]
{\tt BASICW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[ME \hfill]
{\tt MEW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[CGNR \hfill]
{\tt CGNRW (SUBA,SUBAT,SUBQL,SUBQLT,SUBQR,SUBQRT, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[LSQR \hfill]
{\tt LSQRW (SUBA,SUBAT,SUBQL,SUBQLT,SUBQR,SUBQRT, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[ODIR \hfill]
{\tt ODIRW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[OMIN \hfill]
{\tt OMINW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[ORES \hfill]
{\tt ORESW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[IOM \hfill]
{\tt IOMW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[GMRES \hfill]
{\tt GMRESW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[USYMLQ \hfill]
{\tt USLQW (SUBA,SUBAT,SUBQL,SUBQLT,SUBQR,SUBQRT, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[USYMQR \hfill]
{\tt USQRW (SUBA,SUBAT,SUBQL,SUBQLT,SUBQR,SUBQRT, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[LANDIR \hfill]
{\tt LDIRW (SUBA,SUBAT,SUBQL,SUBQLT,SUBQR,SUBQRT, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[LANMIN \hfill]
{\tt LMINW (SUBA,SUBAT,SUBQL,SUBQLT,SUBQR,SUBQRT, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[LANRES \hfill]
{\tt LRESW (SUBA,SUBAT,SUBQL,SUBQLT,SUBQR,SUBQRT, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\item[BCGS \hfill]
{\tt BCGSW (SUBA,SUBQL,SUBQR, \\
\hspace*{0.75in} COEF,JCOEF,WFAC,JWFAC,N,U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER) }
\end{list}
\bigskip
\noindent
where
\bigskip
\begin{list}{}{\labelwidth 0.70in
\leftmargin 1.00in \rightmargin 0.25in}
\item[SUBA \hfill]
is the name of a matrix-vector multiplication routine
which must be declared EXTERNAL in the user's calling
routine. It computes $y=Ax$. [name]
\item[SUBAT \hfill]
is the name of a transpose matrix-vector multiplication
routine which must be declared EXTERNAL in the user's
calling routine. It computes $y=A^Tx$. [name]
\item[SUBQL \hfill]
is the name of the left-preconditioning routine which
must be declared EXTERNAL in the user's calling routine.
It solves $Q_Ly=x$ for $y$ given $x$. [name]
\item[SUBQR \hfill]
is the name of the right-preconditioning routine which
must be declared EXTERNAL in the user's calling routine.
It solves $Q_Ry=x$ for $y$ given $x$. [name]
\item[SUBQLT \hfill]
is the name of the transpose left-preconditioning routine
which must be declared EXTERNAL in the user's calling
routine. It solves $Q_L^Ty=x$ for $y$ given $x$.
[name]
\item[SUBQRT \hfill]
is the name of the transpose right-preconditioning routine
which must be declared EXTERNAL in the user's calling
routine. It solves $Q_R^Ty=x$ for $y$ given $x$.
[name]
\item[SUBQ \hfill]
is the name of the SOR iteration routine which must be
declared EXTERNAL in the user's calling routine.
It computes an SOR iteration. [name]
\item[SUBADP \hfill]
is the name of the SSOR adaptive procedure routine which
must be declared EXTERNAL in the user's calling routine.
It computes parameters necessary for adapting on
$\om$ for the SRCGW and SRSIW acceleration routines.
[name]
\item[COEF \hfill]
is a real vector which is passed to the lower level
matrix routines. [real, input]
\item[JCOEF \hfill]
is an integer vector which is passed to the lower level
matrix routines. [integer, input]
\item[WFAC \hfill]
is a real vector which is passed to the lower level
matrix routines. [real, input]
\item[JWFAC \hfill]
is an integer vector which is passed to the lower level
matrix routines. [integer, input]
\item[N \hfill]
is the order of the linear system. [integer, input]
\item[U \hfill]
is a real vector of length N containing the current
estimate of the solution to the linear system on
input. The user must supply an initial guess, which
can be all zeros if no information is known. On
output, U contains an improved estimate of the solution
vector if convergence is achieved. [real, input/output]
\item[UBAR \hfill]
is a vector of length N containing the exact solution
to the linear system, if known. This vector is used
only if the exact stopping test is used. Otherwise,
it may be dimensioned to be of length one. [real, input]
\item[RHS \hfill]
is a vector of length N containing the right-hand-side
of the matrix problem. [real, input]
\item[WKSP \hfill]
is a real vector of length NW used for workspace.
[real, input]
\item[NW \hfill]
is an integer variable indicating the length of the
WKSP vector on input and the amount used on output.
If insufficient real workspace is provided, NW is
set on output to the amount of workspace needed
at the point execution terminated.
[integer, input/output]
\item[IPARM \hfill]
is an integer vector of length $25$ giving various
integer parameters which are described in
Section~\ref{params}. Only parameters $1$ through $11$ and
$22$ are relevant when the package is called at this
level. [integer, input/output]
\item[RPARM \hfill]
is a real vector of length $16$ giving various real
parameters which are described in Section~\ref{params}.
Only parameters $1$ through $12$ are relevant when
the package is called at this level. [real, input/output]
\item[IER \hfill]
is an error flag. A nonzero value on output indicates
a fatal or warning error was detected during the
iteration. See Section~\ref{ier} on error conditions
for the meaning of its assigned values. [integer, output]
\end{list}
\newpage
\indent
Some examples of the use of NSPCG in matrix format-free mode
follow:
\bigskip
\noindent
{\bf Example 1:}
\bigskip
\indent
In this example, NSPCG was used to solve the linear system $Ax=b$
which resulted from discretizing the problem
\[ \lc \begin{array}{cc}
u_{xx} + 2u_{yy} = 0 & \mbox{on S $= [0,1]\times[0,1]$} \\
u = 1 + xy & \mbox{on boundary of S}
\end{array} \right. \]
using the standard 5-point central finite difference formula with
a mesh size of $h = \frac{1}{11}$. The finite difference stencil
at node $(i,j)$ is thus
\[ 6u_{i,j}-u_{i-1,j}-u_{i+1,j}-2u_{i,j+1}-2u_{i,j-1} = 0 \]
This resulted in a symmetric and positive definite matrix of order
$100$ with five nonzero diagonals. The matrix coefficients were
not stored in an array, but a subroutine called MULT was written
to compute the matrix-vector product, $y=Ax$. The iterative method
used was Richardson's method with conjugate gradient acceleration.
Note that for Richardson's method, $Q=I$, so COPY was used for
the SUBQL parameter in the CGW parameter list.
The computer used to run the program was a Cyber 170/750
and the compiler used was FTN5. The program to set up the problem
and the output resulting from this call to CGW is given below:
\begin{verbatim}
PROGRAM MAIN (OUTPUT,TAPE6=OUTPUT)
C
C ... ARRAY DECLARATIONS.
C
REAL RHS(100), U(100), WKSP(600), UBAR(1), RPARM(30)
INTEGER IPARM(30)
EXTERNAL MULT, COPY
C
NW = 600
C
C ... GENERATE RHS.
C
NX = 10
NY = 10
N = NX*NY
H = 1.0/FLOAT(NX + 1)
DO 10 I = 1,N
RHS(I) = 0.0
10 CONTINUE
K = 0
DO 30 J = 1,NY
Y = FLOAT(J)*H
DO 25 I = 1,NX
X = FLOAT(I)*H
K = K + 1
IF (J .EQ. 1) RHS(K) = RHS(K) + 2.0
IF (J .EQ. NY) RHS(K) = RHS(K) + 2.0*(1.0 + X)
IF (I .EQ. 1) RHS(K) = RHS(K) + 1.0
IF (I .EQ. NX) RHS(K) = RHS(K) + 1.0 + Y
25 CONTINUE
30 CONTINUE
CALL DFAULT (IPARM,RPARM)
C
C ... NOW, RESET SOME DEFAULT VALUES.
C
IPARM(3) = 3
RPARM(1) = 1.0E-8
C
C ... GENERATE AN INITIAL GUESS FOR U AND CALL NSPCG.
C
CALL VFILL (N,U,0.0)
C
CALL CGW (MULT,COPY,COEF,JCOEF,WFAC,JWFAC,N,
A U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER)
STOP
END
SUBROUTINE MULT (COEF,JCOEF,WFAC,JWFAC,N,X,Y)
REAL X(N), Y(N)
NX = 10
C
C ... COMPUTE PRODUCT AS IF FIRST SUPERDIAGONAL AND FIRST
C SUBDIAGONAL WERE FULL.
C
Y(1) = 6.0*X(1) - X(2) - 2.0*X(NX+1)
DO 10 I = 2,NX
Y(I) = 6.0*X(I) - X(I+1) - X(I-1) - 2.0*X(I+NX)
10 CONTINUE
DO 15 I = NX+1,N-NX
Y(I) = 6.0*X(I) - X(I+1) - X(I-1) - 2.0*(X(I+NX) + X(I-NX))
15 CONTINUE
DO 20 I = N-NX+1,N-1
Y(I) = 6.0*X(I) - X(I+1) - X(I-1) - 2.0*X(I-NX)
20 CONTINUE
Y(N) = 6.0*X(N) - X(N-1) - 2.0*X(N-NX)
C
C ... MAKE CORRECTIONS TO Y VECTOR FOR ZEROS IN FIRST SUPERDIAGONAL
C AND FIRST SUBDIAGONAL.
C
DO 25 I = NX,N-NX,NX
Y(I) = Y(I) + X(I+1)
25 CONTINUE
DO 30 I = NX+1,N-NX+1,NX
Y(I) = Y(I) + X(I-1)
30 CONTINUE
RETURN
END
\end{verbatim}
\newpage
\begin{verbatim}
INITIAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 100 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-07 (ZETA )
RPARM( 2) = .20000000E+01 (EMAX )
RPARM( 3) = .10000000E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .00000000E+00 (TIMIT )
RPARM( 7) = .00000000E+00 (DIGIT1)
RPARM( 8) = .00000000E+00 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
\end{verbatim}
\newpage
\begin{verbatim}
CG
INTERMEDIATE OUTPUT AFTER EACH ITERATION
ITERATION CONVERGENCE EMAX EMIN
N S TEST
0 0 .13900E+02 .20000E+01 .10000E+01
1 1 .56024E+00 .37349E+01 .37349E+01
2 2 .49004E+00 .61940E+01 .19544E+01
3 3 .51510E+00 .74467E+01 .12374E+01
4 4 .52558E+00 .86621E+01 .86417E+00
5 5 .63725E+00 .97284E+01 .61624E+00
6 6 .89475E+00 .10417E+02 .39239E+00
7 7 .66075E+00 .10698E+02 .32758E+00
8 8 .68211E+00 .10905E+02 .28757E+00
9 9 .48761E+00 .11011E+02 .26502E+00
10 10 .24106E+00 .11096E+02 .25229E+00
11 11 .15764E+00 .11146E+02 .24748E+00
12 12 .94345E-01 .11245E+02 .24514E+00
13 13 .49491E-01 .11486E+02 .24418E+00
14 14 .29955E-01 .11621E+02 .24383E+00
15 15 .24829E-01 .11659E+02 .24363E+00
16 16 .23087E-01 .11676E+02 .24346E+00
17 17 .15022E-01 .11699E+02 .24323E+00
18 18 .83114E-02 .11718E+02 .24312E+00
19 19 .43446E-02 .11734E+02 .24308E+00
20 20 .31278E-02 .11743E+02 .24307E+00
21 21 .18751E-02 .11751E+02 .24306E+00
22 22 .16765E-02 .11754E+02 .24305E+00
23 23 .93634E-03 .11755E+02 .24304E+00
24 24 .47839E-03 .11756E+02 .24304E+00
25 25 .28610E-03 .11756E+02 .24304E+00
26 26 .16126E-03 .11757E+02 .24304E+00
27 27 .85505E-04 .11757E+02 .24304E+00
28 28 .56106E-04 .11757E+02 .24304E+00
29 29 .28246E-04 .11757E+02 .24304E+00
30 30 .16505E-04 .11757E+02 .24304E+00
31 31 .96348E-05 .11757E+02 .24304E+00
32 32 .62848E-05 .11757E+02 .24304E+00
33 33 .33737E-05 .11757E+02 .24304E+00
34 34 .12847E-05 .11757E+02 .24304E+00
35 35 .63580E-06 .11757E+02 .24304E+00
36 36 .25529E-06 .11757E+02 .24304E+00
37 37 .12834E-06 .11757E+02 .24304E+00
38 38 .64785E-07 .11757E+02 .24304E+00
39 39 .24616E-07 .11757E+02 .24304E+00
40 40 .10040E-07 .11757E+02 .24304E+00
41 41 .41463E-08 .11757E+02 .24304E+00
CG HAS CONVERGED IN 41 ITERATIONS
\end{verbatim}
\newpage
\begin{verbatim}
FINAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 41 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-07 (ZETA )
RPARM( 2) = .11756958E+02 (EMAX )
RPARM( 3) = .24304216E+00 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .59400000E+00 (TIMIT )
RPARM( 7) = .83823401E+01 (DIGIT1)
RPARM( 8) = .90374486E+01 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
\end{verbatim}
\newpage
\noindent
{\bf Example 2:}
\bigskip
\indent
In this example, the same problem given in Example 1 was solved
using the SOR method with the SORW module. In addition to the
module called MULT to compute the matrix-vector product, a routine
called SRPASS was written to compute an SOR iteration. Note that
the ITCOM5 common statement had to be included in the SRPASS routine
so that the value of $\om$ could be used. The program to set up
the problem and the output resulting from this call to SORW is given
below:
\begin{verbatim}
PROGRAM MAIN (OUTPUT,TAPE6=OUTPUT)
C
C ... ARRAY DECLARATIONS.
C
REAL RHS(100), U(100), WKSP(600), UBAR(1), RPARM(30)
INTEGER IPARM(30)
EXTERNAL MULT, SRPASS
C
NW = 600
C
C ... GENERATE RHS.
C
NX = 10
NY = 10
N = NX*NY
H = 1.0/FLOAT(NX + 1)
DO 10 I = 1,N
RHS(I) = 0.0
10 CONTINUE
K = 0
DO 30 J = 1,NY
Y = FLOAT(J)*H
DO 25 I = 1,NX
X = FLOAT(I)*H
K = K + 1
IF (J .EQ. 1) RHS(K) = RHS(K) + 2.0
IF (J .EQ. NY) RHS(K) = RHS(K) + 2.0*(1.0 + X)
IF (I .EQ. 1) RHS(K) = RHS(K) + 1.0
IF (I .EQ. NX) RHS(K) = RHS(K) + 1.0 + Y
25 CONTINUE
30 CONTINUE
CALL DFAULT (IPARM,RPARM)
C
C ... NOW, RESET SOME DEFAULT VALUES.
C
IPARM(3) = 3
RPARM(1) = 1.0E-8
C
C ... GENERATE AN INITIAL GUESS FOR U AND CALL NSPCG.
C
CALL VFILL (N,U,0.0)
C
CALL SORW (MULT,SRPASS,COEF,JCOEF,WFAC,JWFAC,N,
A U,UBAR,RHS,WKSP,NW,IPARM,RPARM,IER)
STOP
END
SUBROUTINE MULT (COEF,JCOEF,WFAC,JWFAC,N,X,Y)
REAL X(N), Y(N)
NX = 10
C
C ... COMPUTE PRODUCT AS IF FIRST SUPERDIAGONAL AND FIRST
C SUBDIAGONAL WERE FULL.
C
Y(1) = 6.0*X(1) - X(2) - 2.0*X(NX+1)
DO 10 I = 2,NX
Y(I) = 6.0*X(I) - X(I+1) - X(I-1) - 2.0*X(I+NX)
10 CONTINUE
DO 15 I = NX+1,N-NX
Y(I) = 6.0*X(I) - X(I+1) - X(I-1) - 2.0*(X(I+NX) + X(I-NX))
15 CONTINUE
DO 20 I = N-NX+1,N-1
Y(I) = 6.0*X(I) - X(I+1) - X(I-1) - 2.0*X(I-NX)
20 CONTINUE
Y(N) = 6.0*X(N) - X(N-1) - 2.0*X(N-NX)
C
C ... MAKE CORRECTIONS TO Y VECTOR FOR ZEROS IN FIRST SUPERDIAGONAL
C AND FIRST SUBDIAGONAL.
C
DO 25 I = NX,N-NX,NX
Y(I) = Y(I) + X(I+1)
25 CONTINUE
DO 30 I = NX+1,N-NX+1,NX
Y(I) = Y(I) + X(I-1)
30 CONTINUE
RETURN
END
SUBROUTINE SRPASS (COEF,JCOEF,WFAC,JWFAC,N,U,RHS,UNEW)
C
C ... SRPASS DOES AN SOR ITERATION.
C
C UNEW = INV((1/W)*D + L)*(((1/W)-1)*D*UN + RHS - U*UN)
C
C ... PARAMETERS --
C
C N ORDER OF SYSTEM
C U CURRENT SOLUTION VECTOR
C RHS RIGHT HAND SIDE
C UNEW UPDATED SOLUTION VECTOR
C
C ... SPECIFICATIONS FOR PARAMETERS
C
DIMENSION U(1), UNEW(1), RHS(1)
LOGICAL OMGADP
COMMON / ITCOM5 / OMGADP, OMEGA, ALPHAB, BETAB, FFF, SPECR
C
C ... TEMP = ((1/W)-1)*D*UN + RHS - U*UN
C UNEW IS USED FOR TEMP.
C
NX = 10
CON = 6.0*(1.0/OMEGA - 1.0)
DO 10 I = 1,N
UNEW(I) = CON*U(I) + RHS(I)
10 CONTINUE
DO 15 I = 1,N-1
UNEW(I) = UNEW(I) + U(I+1)
15 CONTINUE
DO 20 I = 1,N-NX
UNEW(I) = UNEW(I) + 2.0*U(I+NX)
20 CONTINUE
DO 25 I = NX,N-NX,NX
UNEW(I) = UNEW(I) - U(I+1)
25 CONTINUE
C
C ... UNEW = INV((1/W)*D + L)*TEMP
C
CON = OMEGA/6.0
DO 40 J = 1,NX
IBGN = (J-1)*NX + 1
IEND = J*NX
UNEW(IBGN) = CON*UNEW(IBGN)
DO 30 I = IBGN+1,IEND
UNEW(I) = CON*(UNEW(I) + UNEW(I-1))
30 CONTINUE
IF (J .EQ. NX) GO TO 40
DO 35 I = IBGN,IEND
UNEW(I+NX) = UNEW(I+NX) + 2.0*UNEW(I)
35 CONTINUE
40 CONTINUE
RETURN
END
\end{verbatim}
\newpage
\begin{verbatim}
INITIAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 100 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-07 (ZETA )
RPARM( 2) = .20000000E+01 (EMAX )
RPARM( 3) = .10000000E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .00000000E+00 (TIMIT )
RPARM( 7) = .00000000E+00 (DIGIT1)
RPARM( 8) = .00000000E+00 (DIGIT2)
RPARM( 9) = .10000000E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .00000000E+00 (SPECR )
\end{verbatim}
\newpage
\begin{verbatim}
SOR
INTERMEDIATE OUTPUT AFTER EACH ITERATION
NUMBER OF CONVERGENCE EMAX OMEGA SPECTRAL
ITERATIONS TEST RADIUS
0 0 .10000E+04 .20000E+01 .10000E+01 .00000E+00
1 0 .10000E+04 .20000E+01 .10000E+01 .53872E+00
2 0 .72510E+00 .20000E+01 .10000E+01 .70482E+00
3 0 .70622E+00 .20000E+01 .10000E+01 .78917E+00
4 0 .70165E+00 .91538E+00 .10000E+01 .83791E+00
5 1 .70165E+00 .91538E+00 .14259E+01 .18722E+01
6 1 .70165E+00 .91538E+00 .14259E+01 .81405E+00
7 1 .70165E+00 .91538E+00 .14259E+01 .82807E+00
8 1 .73605E+00 .91538E+00 .14259E+01 .83682E+00
9 1 .63941E+00 .91538E+00 .14259E+01 .84186E+00
10 1 .33339E+00 .91538E+00 .14259E+01 .84339E+00
11 1 .27588E+00 .91538E+00 .14259E+01 .84086E+00
12 1 .22191E+00 .91538E+00 .14259E+01 .83484E+00
13 1 .17539E+00 .91538E+00 .14259E+01 .82715E+00
14 1 .13764E+00 .91538E+00 .14259E+01 .81950E+00
15 1 .96272E-01 .91538E+00 .14259E+01 .81267E+00
16 1 .75684E-01 .91538E+00 .14259E+01 .80756E+00
17 1 .59578E-01 .91538E+00 .14259E+01 .80356E+00
18 1 .46828E-01 .91538E+00 .14259E+01 .80005E+00
19 1 .36758E-01 .91538E+00 .14259E+01 .79698E+00
\end{verbatim}
\hspace*{2.0in} \vdots
\begin{verbatim}
46 2 .33557E-05 .95950E+00 .15604E+01 .59139E+00
47 2 .19459E-05 .95950E+00 .15604E+01 .58663E+00
48 2 .11480E-05 .95950E+00 .15604E+01 .58801E+00
49 2 .69725E-06 .95950E+00 .15604E+01 .59582E+00
50 2 .43310E-06 .95950E+00 .15604E+01 .60581E+00
51 2 .26430E-06 .95950E+00 .15604E+01 .60755E+00
52 2 .14596E-06 .95950E+00 .15604E+01 .58457E+00
53 2 .83253E-07 .95950E+00 .15604E+01 .57860E+00
54 2 .51143E-07 .95950E+00 .15604E+01 .59313E+00
55 2 .30889E-07 .95950E+00 .15604E+01 .59749E+00
56 2 .17899E-07 .95950E+00 .15604E+01 .59011E+00
57 2 .10123E-07 .95950E+00 .15604E+01 .57979E+00
58 2 .55955E-08 .95950E+00 .15604E+01 .56811E+00
SOR HAS CONVERGED IN 58 ITERATIONS
\end{verbatim}
\newpage
\begin{verbatim}
FINAL ITERATIVE PARAMETERS
GENERAL AND ACCELERATION PARAMETERS
IPARM( 1) = 2 (NTEST )
IPARM( 2) = 58 (ITMAX )
IPARM( 3) = 3 (LEVEL )
IPARM( 4) = 6 (NOUT )
IPARM( 5) = 0 (IDGTS )
IPARM( 6) = 1 (MAXADP)
IPARM( 7) = 1 (MINADP)
IPARM( 8) = 1 (IOMGAD)
IPARM( 9) = 5 (NS1 )
IPARM(10) = 100000 (NS2 )
IPARM(11) = 0 (NS3 )
RPARM( 1) = .10000000E-07 (ZETA )
RPARM( 2) = .95950463E+00 (EMAX )
RPARM( 3) = .10000000E+01 (EMIN )
RPARM( 4) = .75000000E+00 (FF )
RPARM( 5) = .75000000E+00 (FFF )
RPARM( 6) = .62900000E+00 (TIMIT )
RPARM( 7) = .82521604E+01 (DIGIT1)
RPARM( 8) = .87703729E+01 (DIGIT2)
RPARM( 9) = .15604363E+01 (OMEGA )
RPARM(10) = .00000000E+00 (ALPHAB)
RPARM(11) = .25000000E+00 (BETAB )
RPARM(12) = .56811199E+00 (SPECR )
\end{verbatim}
\newpage
\section{Installation Guide}
\label{install}
\indent
This package is written in 1977 ANSI Standard Fortran in the
interests of portability. However some changes will have to be
made for different computers. The following changes are
recommended.
\begin{enumerate}
\item
The timing routine TIMER uses the Fortran function SECOND
to return the elapsed CPU time in seconds. For different
computers, this may have to be changed. Also, there may
be more accurate timing facilities available than SECOND.
\item
The value of the machine relative precision is contained in
the variable SRELPR, which is set in the DFAULT subroutine.
The relative precision SRELPR is the smallest positive machine
number such that $1 +$ SRELPR is greater than $1$. This and other
default values may be permanently changed when the code is
installed by changing their values in this subroutine. In
particular, SRELPR must be changed for different computers.
If the installer of this package does not know its value,
an approximate value can be determined from a simple Fortran
program given in the comment statements of subroutine DFAULT.
\item
KEYGS is a switch in DFAULT which may need to be changed for
different vector computers. KEYGS indicates whether
vectorized gather and scatter operations can be chained to
other vector operations or whether they need to be
done explicitly using workspace vectors.
\begin{tabular}{rp{5.0in}}
KEYGS $=1$ & if explicit gathering and scattering
is needed for vectorization \\
& \\
$=2$ & if vectorized gathering and scattering
can be chained to other vector operations
\end{tabular}
As an example, if the loop
\begin{verbatim}
DO 10 I = 1,N
C(I) = C(I) + A(IA(I))
10 CONTINUE
\end{verbatim}
can be vectorized by the compiler, KEYGS should be set to $2$.
If the loop needs to be rewritten as
\begin{verbatim}
DO 10 I = 1,N
WKSP(I) = A(IA(I))
C(I) = C(I) + WKSP(I)
10 CONTINUE
\end{verbatim}
for vectorization, KEYGS should be set to $1$.
KEYGS $= 1$ should be selected for the following computers:
\begin{tabular}{l}
CDC Cyber 205 \\
Cray-1, Cray-1S, Cray-1M \\
Cray X-MPs without hardware gather/scatter
\end{tabular}
KEYGS $= 2$ should be selected for the following computers:
\begin{tabular}{l}
Cray X-MPs with hardware gather/scatter \\
Cray-2 \\
All scalar computers
\end{tabular}
KEYGS will inform NSPCG if it needs to set aside workspace
for gather and scatter operations.
\item
The routines VGATHI, VGATHR, VSCATI, and VSCATR contain
Fortran implementations of gather and scatter instructions.
For the CDC Cyber 205, the Fortran DO loops should be
replaced by calls to Q8VGATHR and Q8VSCATR, which are
commented out. For the Cray-1 and Cray X-MPs without
hardware gather/scatter, the Fortran DO loops should be
replaced by calls to GATHER and SCATTER in SCILIB.
For vector machines with hardware gather/scatter, the
Fortran DO loops should be used since they vectorize as
they are.
\end{enumerate}
\newpage
\section{Acknowledgements}
\indent
The authors wish to acknowledge the support of the Center for
Numerical Analysis and the Computation Center at The University of Texas
at Austin during the development of the NSPCG package. The support
and helpful comments from Dr. David M. Young are also gratefully
acknowledged.
The NSPCG package was developed with the aid of the Control Data
Dual Cyber 170/750 system operated by the Computation Center of The
University of Texas at Austin, the Control Data Cyber 205 system
operated by the Institute for Computational Studies of Colorado
State University, and the Cray X-MP system operated by the Center for
High Performance Computing of The University of Texas System. The
cooperation of each of these centers in making their facilities
available for use on this project is greatly appreciated.
The authors wish to acknowledge the support, in part, of the National
Science Foundation through Grant CCR-8518722, the Department of Energy
under Grant DE-FG05-87ER25048, Cray Research, Inc. through Grant LTR DTD,
the Control Data Corporation through its PACER Fellowship Program (1985-87),
and Sandia National Laboratory through Contract No. 06-4298 (1987-88).
\newpage
\begin{thebibliography}{99}
\bibitem{Adams} Adams, L.M. {\em Iterative Algorithms for Large
Sparse Linear Systems on Parallel Computers.} Doctoral dissertation.
University of Virginia, November 1982. Also published as NASA
CR-166027, NASA Langley Research Center.
\bibitem{Axel1} Axelsson, O. ``Incomplete Block Matrix
Factorization Preconditioning Methods. The Ultimate Answer?"
CNA-195, Center for Numerical Analysis, University of Texas, Austin,
Texas, 78712, July 1984.
\bibitem{Axel2} Axelsson, O. ``A Survey of Vectorizable Preconditioning
Methods for Large Scale Finite Element Matrix Problems." CNA-190,
Center for Numerical Analysis, University of Texas, Austin, Texas,
78712, February, 1984.
\bibitem{Birk1} Birkhoff, G. and Lynch, R.E. {\em Numerical Solution
of Elliptic Problems.} Philadelphia: SIAM, 1984.
\bibitem{CGL} Concus, P., Golub, G., and O'Leary, D. ``A Generalized
Conjugate Gradient Method for the Numerical Solution of Elliptic
Partial Differential Equations," in {\em Sparse Matrix Computations}
(eds. J. R. Bunch and Donald J. Rose). New York: Academic Press, Inc.,
1976.
\bibitem{squeeze} Dongarra, J.J. and Eisenstat, S.C.
``Squeezing the Most out of an Algorithm in CRAY FORTRAN."
{\em ACM Transactions on Mathematical Software,} Vol. 10, No. 3,
September 1984, pp. 219-230.
\bibitem{GCR} Eisenstat, S.C., Elman, H.C., and Schultz, M.
``Variational Iterative Methods for Nonsymmetric Systems of
Linear Equations." {\em SIAM Journal on Numerical Analysis,} Vol. 20,
No. 2, April 1983, pp. 345-357.
\bibitem{ME} Fridman, V.M. ``The Method of Minimum Iterations
with Minimum Errors for a System of Linear Algebraic Equations
with a Symmetrical Matrix", {\em USSR Computational Math. and Math.
Phys.}, 2:362-363 (1963).
\bibitem{Gust} Gustafsson, I. {\em Stability and Rate of Convergence
of Modified Incomplete Cholesky Factorization Methods.} Doctoral
dissertation. Chalmers University of Technology and the University
of G\"{o}teborg, April 1979.
\bibitem{Young1} Hageman, L. and Young, D.M. {\em Applied Iterative
Methods.} New York: Academic Press, Inc., 1981.
\bibitem{LANCZOS} Jea, K.C. and Young, D.M. ``On the
Simplification of Generalized Conjugate Gradient Methods for
Nonsymmetrizable Linear Systems." {\em Linear Algebra and its
Applications,} Vol. 52/53, 1983, pp. 399-417.
\bibitem{BCGS} Joly, P. and Eymard, R. ``Preconditioned Biconjugate
Gradient Methods for Numerical Reservoir Simulation." To appear in
{\em Journal of Computational Physics.}
\bibitem{kershaw} Kershaw, D.S. ``The Incomplete Cholesky--Conjugate
Gradient Method for the Iterative Solution of Systems of Linear
Equations." {\em Journal of Computational Physics,} Vol. 26, 1978,
pp. 43-65.
\bibitem{itpackv2c} Kincaid, D., Oppe, T., Respess, J., and Young, D.
``ITPACKV 2C User's Guide." CNA-191, Center for Numerical Analysis,
University of Texas, Austin, Texas, 78712, February 1984.
\bibitem{itpack2c} Kincaid, D., Respess, J., Young, D., and Grimes, R.
``Algorithm 586 ITPACK 2C: A FORTRAN Package for Solving Large
Sparse Linear Systems by Adaptive Accelerated Iterative Methods."
{\em ACM Transactions on Mathematical Software,} Vol. 8, No. 3,
September 1982, pp. 302-322.
\bibitem{future} Kincaid, David R. and Young, David M. ``The
ITPACK Project: Past, Present, and Future." CNA-180, Center for
Numerical Analysis, University of Texas, Austin, Texas, 78712,
March 1983.
\bibitem{blas} Lawson, C.L., Hanson, R.J., Kincaid, D.R., and
Krogh, F.T. ``Basic Linear Algebra Subprograms for Fortran Usage."
{\em ACM Transactions on Mathematical Software,} Vol. 5, No. 3,
September 1979, pp. 308-323.
\bibitem{Man1} Manteuffel, T.A. ``An Incomplete Factorization
Technique for Positive Definite Linear Systems." {\em Mathematics
of Computation,} Vol. 34, No. 150, April 1980, pp. 473-497.
\bibitem{Meij} Meijerink, J.A. and van der Vorst, H.A. ``An
Iterative Solution Method for Linear Systems of Which the
Coefficient Matrix is a Symmetric M-Matrix." {\em Mathematics
of Computation,} Vol. 31, No. 137, January 1977, pp. 148-162.
\bibitem{NSPCG} Oppe, T.C., Joubert, W.D., and Kincaid, D.R.
``Algorithms in NSPCG." In preparation.
\bibitem{SYMMLQ} Paige, C.C. and Saunders, M.A. ``Solution
of Sparse Indefinite Systems of Linear Equations." {\em SIAM
Journal on Numerical Analysis,} Vol. 12, No. 4, Sept. 1975,
pp. 617-629.
\bibitem{LSQR} Paige, C.C. and Saunders, M.A. ``LSQR: An Algorithm
for Sparse Linear Equations and Sparse Least Squares." {\em ACM
Transactions on Mathematical Software,} Vol. 8, No. 1, March 1982,
pp. 43-71.
\bibitem{PCGPAK} ``PCGPAK User's Guide (Version 1.2)." Scientific
Computing Associates, 1984.
\bibitem{ELLPACK} Rice, J.R. and Boisvert, R.F. (eds.)
{\em Solving Elliptic Problems Using ELLPACK.} Springer-Verlag,
New York, 1985.
\bibitem{IOM1} Saad, Y. ``Krylov Subspace Methods for Solving
Large Unsymmetric Linear Systems." {\em Math. Comp.,} 37 (July 1981),
pp. 105-126.
\bibitem{IOM2} Saad, Y. ``Practical Use of Some Krylov Subspace
Methods for Solving Indefinite and Nonsymmetric Linear Systems."
{\em SIAM Journal on Scientific and Statistical Computing,} Vol. 5,
No. 1, March 1984, pp. 203-228.
\bibitem{GMRES} Saad, Y. and Schultz, M.H. ``GMRES: A
Generalized Minimal Residual Algorithm for Solving Nonsymmetric
Linear Systems." {\em SIAM Journal on Scientific and Statistical
Computing,} Vol. 7, No. 3, July 1986, pp. 856-869.
\bibitem{YOUCEF} Saad, Y. and Schultz, M.H. ``Conjugate
Gradient-like Algorithms for Solving Nonsymmetric Linear
Systems." {\em Mathematics of Computation,} Vol. 44, No. 170,
April 1985, pp. 417-424.
\bibitem{Sonn1} Sonneveld, P. ``CGS, a Fast Lanczos-Type Solver
for Nonsymmetric Linear Systems." Report 84-16, Department of
Mathematics and Informatics, Delft University of Technology,
1984.
\bibitem{Varga1} Varga, R.S. {\em Matrix Iterative Analysis.}
Englewood Cliffs, N.J.: Prentice-Hall, Inc., 1962.
\bibitem{Wach1} Wachspress, E.L. {\em Iterative Solution of Elliptic
Systems and Applications to the Neutron Diffusion Equations
of Reactor Physics.} Englewood Cliffs, N.J.: Prentice-Hall, Inc.,
1966.
\bibitem{Wallis1} Wallis, J.R. ``Incomplete Gaussian Elimination
as a Preconditioning for Generalized Conjugate Gradient Acceleration."
SPE 12265. {\em SPE,} 1983, pp. 325-334.
\bibitem{Wallis2} Wallis, J.R., Kendall, R.P., and Little, T.E.
``Constrained Residual Acceleration of Conjugate Residual
Methods." SPE 13536. {\em SPE,} 1985, pp. 415-428.
\bibitem{Yip} Yip, E.L., Saunders, M.A., and Simon, H.D.
``Two Conjugate Gradient Type Methods for Sparse Unsymmetric
Linear Equations." Unpublished manuscript. Boeing Computer
Services, Seattle, Wash. 98124, 1984.
\bibitem{Young2} Young, D.M. {\em Iterative Solution of Large Linear
Systems.} New York: Academic Press, Inc., 1971.
\bibitem{ORTHO} Young, D.M. and Jea, K.C. ``Generalized Conjugate
Gradient Acceleration of Nonsymmetrizable Iterative Methods."
{\em Linear Algebra and its Applications,} 34:159-194 (1980).
\end{thebibliography}
\end{document}