Chapter 1
Linear Algebra
Linear algebra is motivated by the need to solve systems of linear algebraic equations
in a finite number of variables. It is the foundation upon which almost all of applied
mathematics rests. This is not to say that nonlinear equations are not important; rather, it
means that one cannot make any progress in understanding the more complicated nonlinear
regime before one has a firm grasp of the fundamentals of linear systems.
The basic concepts of matrix, vector and scalar systematize the study of linear systems. The principal solution algorithm is known as Gaussian elimination, and is one of
the most important techniques in applied (as well as theoretical) mathematics. In this
chapter, we begin with the study of linear systems involving the same number of equations
as unknowns. We then introduce the basic arithmetic operations for matrices and vectors. A key observation is that Gaussian elimination is equivalent to a particular matrix
factorization, known as the (permuted) L U decomposition. The decomposition provides
additional insight into the solution algorithm, and will be repeatedly used in a variety of
applications appearing in the subsequent chapters. We shall also discuss some practical
issues and limitations in computer implementations of the Gaussian elimination method
for large systems arising in applications.
We also present the basics of matrix inverses and determinants. However, both of
these classical linear algebra topics are of much less importance in applied mathematics, in
contrast to their usual central role in more theoretically-oriented treatments of the subject.
Nevertheless, we will have occasion to make use of both inverses and determinants, and so
need to learn to work with them.

1.1. Solution of Linear Systems.


Gaussian elimination is a simple, systematic approach to the solution of systems of
linear equations. It is the workhorse of linear algebra, and as such of absolutely fundamental importance in applied mathematics. In this section we review the method in the
most important case in which there are the same number of equations as unknowns. More
general situations will be considered in Section 1.8.
Consider an elementary system of three linear equations
$$ x + 2y + z = 2, \qquad 2x + 6y + z = 7, \qquad x + y + 4z = 3, \eqno(1.1) $$

in three unknowns x, y, z. Linearity refers to the fact that the unknowns only appear to the first power in the equations; also, there are no product terms like x y or x y z. (The official definition of linearity will appear in Chapter 7.) The basic solution method is to systematically employ the following fundamental operation:
Linear System Operation #1 : Add a multiple of one equation to another equation.
Before continuing, the reader should convince themselves that this operation does not
change the solutions to the system. As a result, our goal is to judiciously apply the
operation and so be led to a much simpler linear system that is easy to solve, and, moreover
has the same solutions as the original. Any linear system that is derived from the original
system by successive application of such operations will be called an equivalent system.
By the preceding remark, equivalent linear systems have the same solutions.
The systematic feature is that we successively eliminate the variables in our equations
in order of appearance. Thus, to eliminate the first variable x from the second equation,
we subtract twice the first equation from the second, leading to the equivalent system
$$ x + 2y + z = 2, \qquad 2y - z = 3, \qquad x + y + 4z = 3. \eqno(1.2) $$

Next, we eliminate x from the third equation by subtracting the first equation from it; the
resulting system is
$$ x + 2y + z = 2, \qquad 2y - z = 3, \qquad -y + 3z = 1. \eqno(1.3) $$

The equivalent system (1.3) is already simpler than the original system (1.1). Notice that
the second and third equations do not involve x (by design) and so constitute a system
of two linear equations for two unknowns. Moreover, once we have solved this subsystem
for y and z, we can substitute the answer into the first equation, and we need only solve
a single linear equation for x.
We continue on in this fashion, the next phase being the elimination of the second
variable y from the third equation, which is done by adding 1/2 of the second equation to it.
The resulting system is
$$ x + 2y + z = 2, \qquad 2y - z = 3, \qquad \tfrac{5}{2}\,z = \tfrac{5}{2}. \eqno(1.4) $$

This is the simple system we are after. It is in what is called triangular form, which means
that, while the first equation involves all three variables, the second equation only involves
the second and third variables, and the last equation only involves the last variable. Any
triangular system can be straightforwardly solved by the method of Back Substitution.



Namely, we work backwards, solving the last equation first, which gives z = 1. We substitute this result back into the next to last equation, which becomes 2y - 1 = 3, with solution y = 2. We finally substitute these two values for y and z into the first equation, which becomes x + 5 = 2, and so the solution to the triangular system (1.4) is
$$ x = -3, \qquad y = 2, \qquad z = 1. \eqno(1.5) $$

Moreover, since we only used our basic operation to pass from (1.1) to the triangular
system (1.4), this is also the solution to the original system of linear equations. Thus, the
system (1.1) has a unique solution (meaning one and only one), namely (1.5).
And that, barring a few extra issues that can crop up from time to time, is all that
there is to the method of Gaussian elimination! It is very simple, but its importance
cannot be overemphasized. Before discussing the possible complications, it will help to
reformulate our method in a more convenient matrix notation.
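Before moving on to matrix notation, here is a brief computational sketch of the procedure just described, applied to the system (1.1). It is not part of the original exposition, and the variable names are ours, but it spells out the elimination and back substitution steps. Python is assumed.

    # Solve the 3 x 3 system (1.1) by the elimination steps described above,
    # followed by back substitution.  A sketch only; no pivoting or error checks.
    A = [[1.0, 2.0, 1.0],
         [2.0, 6.0, 1.0],
         [1.0, 1.0, 4.0]]
    b = [2.0, 7.0, 3.0]
    n = 3
    # Forward elimination: subtract multiples of row j from the rows below it.
    for j in range(n):
        for i in range(j + 1, n):
            l = A[i][j] / A[j][j]          # the multiplier
            for k in range(j, n):
                A[i][k] -= l * A[j][k]
            b[i] -= l * b[j]
    # Back substitution: solve the triangular system from the last equation up.
    x = [0.0] * n
    for i in range(n - 1, -1, -1):
        s = b[i] - sum(A[i][k] * x[k] for k in range(i + 1, n))
        x[i] = s / A[i][i]
    print(x)    # expected: [-3.0, 2.0, 1.0], the solution (1.5)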

1.2. Matrices and Vectors.


A matrix is a rectangular array of numbers. Thus,
$$ \begin{pmatrix} 1 & 0 & 3 \\ 2 & 4 & 1 \end{pmatrix}, \qquad (\, .2 \;\; 1.6 \;\; .32 \,), \qquad \begin{pmatrix} 0 \\ 0 \end{pmatrix}, \qquad \begin{pmatrix} 1 & 3 \\ 2 & 5 \end{pmatrix} $$
are all examples of matrices. We use the notation
$$ A = \begin{pmatrix} a_{11} & a_{12} & \cdots & a_{1n} \\ a_{21} & a_{22} & \cdots & a_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ a_{m1} & a_{m2} & \cdots & a_{mn} \end{pmatrix} \eqno(1.6) $$
for a general matrix of size m × n, where m denotes the number of rows in A and n denotes the number of columns. Thus, the preceding examples of matrices have respective sizes 2 × 3, 1 × 3, 2 × 1 and 2 × 2. A matrix is square if m = n, i.e., it has the same number of rows as columns. A column vector is an m × 1 matrix, while a row vector is a 1 × n matrix. As we shall see, column vectors are by far the more important of the two, and the term vector without qualification will always mean column vector. A 1 × 1 matrix, which has but a single entry, is both a row and a column vector.
The entry of A that lies in the i-th row and the j-th column, known as the (i, j) entry of A, is denoted by a_ij. Two matrices are equal, A = B, if and only if they have the same size, and all their entries are the same: a_ij = b_ij.

For the time being, we shall only use real numbers, but eventually we need to analyze matrices
with complex entries.
In tensor analysis, [ Tensor ], a combined sub- and super-script notation is adopted, with a^i_j denoting what we will call a_ij. This has certain advantages, but, for simplicity, we shall stick with subscript notation throughout this text.


A linear system of m equations in n unknowns will take the form
$$ \begin{aligned} a_{11} x_1 + a_{12} x_2 + \cdots + a_{1n} x_n &= b_1, \\ a_{21} x_1 + a_{22} x_2 + \cdots + a_{2n} x_n &= b_2, \\ &\;\;\vdots \\ a_{m1} x_1 + a_{m2} x_2 + \cdots + a_{mn} x_n &= b_m. \end{aligned} \eqno(1.7) $$
As such, it has three basic constituents: the m × n coefficient matrix A, which is given in (1.6), along with the two column vectors
$$ x = \begin{pmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{pmatrix} \qquad \text{and} \qquad b = \begin{pmatrix} b_1 \\ b_2 \\ \vdots \\ b_m \end{pmatrix}, \eqno(1.8) $$
which form the vector of unknowns and the right hand side of the system. These three ingredients, A, x, b, uniquely characterize the system. In our example, (1.1),
$$ A = \begin{pmatrix} 1 & 2 & 1 \\ 2 & 6 & 1 \\ 1 & 1 & 4 \end{pmatrix}, \qquad x = \begin{pmatrix} x \\ y \\ z \end{pmatrix}, \qquad b = \begin{pmatrix} 2 \\ 7 \\ 3 \end{pmatrix}. \eqno(1.9) $$
Remark : We will consistently use bold face lower case letters to denote vectors, and
capital letters to denote general matrices.
Basic Matrix Arithmetic
There are three basic operations of matrix arithmetic: matrix addition, scalar multiplication, and matrix multiplication. First we define addition of matrices. Two matrices of
the same size can be added, and matrix addition is performed entry by entry. Therefore,
if A and B are m × n matrices, their sum C = A + B is the m × n matrix whose entries are given by c_ij = a_ij + b_ij. For example,
$$ \begin{pmatrix} 1 & 2 \\ -1 & 0 \end{pmatrix} + \begin{pmatrix} 3 & -5 \\ 2 & 1 \end{pmatrix} = \begin{pmatrix} 4 & -3 \\ 1 & 1 \end{pmatrix}. $$

Matrix addition is commutative, A+B = B+A, and associative, A+(B+C) = (A+B)+C,


just like ordinary addition.
A scalar is a fancy name for an ordinary number; the term merely distinguishes it from a vector or a matrix. For the time being, we will only be considering real scalars, and matrices with real entries, although later, complex scalars and complex matrices will show up. We will often identify a scalar c ∈ R with the 1 × 1 matrix ( c ) in which it is the sole entry. If c is a scalar and A an m × n matrix, then B = c A is the m × n matrix obtained by multiplying each entry of A by c, so b_ij = c a_ij. For example,
$$ 3 \begin{pmatrix} 1 & 2 \\ -1 & 0 \end{pmatrix} = \begin{pmatrix} 3 & 6 \\ -3 & 0 \end{pmatrix}. $$

Basic properties of scalar multiplication appear in the table at the end of this section.
Finally, we define matrix multiplication. First, the product between a row vector a
and a column vector x having the same number of entries is the scalar defined by the
following rule:
$$ a\,x = ( a_1 \;\; a_2 \;\; \ldots \;\; a_n ) \begin{pmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{pmatrix} = a_1 x_1 + a_2 x_2 + \cdots + a_n x_n = \sum_{k=1}^{n} a_k x_k. $$

More generally, if A is an m × n matrix and B is an n × p matrix, so that the number of columns in A equals the number of rows in B, then the matrix product C = A B is defined as the m × p matrix whose (i, j) entry equals the vector product of the i-th row of A and the j-th column of B. Therefore,
$$ c_{ij} = \sum_{k=1}^{n} a_{ik}\, b_{kj}. $$
Note that our restriction on the sizes of A and B guarantees that the rows and columns will have the same number of entries, and so their product is defined.
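As a quick illustration (ours, not part of the original text), the entry formula translates directly into a triple loop; the Python sketch below uses a hypothetical helper name and assumes compatible sizes without checking them.

    # Multiply an m x n matrix A by an n x p matrix B using c_ij = sum_k a_ik * b_kj.
    def matmul(A, B):
        m, n, p = len(A), len(B), len(B[0])
        C = [[0.0] * p for _ in range(m)]
        for i in range(m):
            for j in range(p):
                for k in range(n):
                    C[i][j] += A[i][k] * B[k][j]
        return C

    # The coefficient matrix of (1.1) times an arbitrary test vector, written as a 3 x 1 matrix:
    A = [[1, 2, 1], [2, 6, 1], [1, 1, 4]]
    x = [[1], [2], [3]]
    print(matmul(A, x))          # [[8], [17], [15]]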
For example, the product of the coefficient matrix and vector of unknowns for our original system (1.1) is given by
$$ A\,x = \begin{pmatrix} 1 & 2 & 1 \\ 2 & 6 & 1 \\ 1 & 1 & 4 \end{pmatrix} \begin{pmatrix} x \\ y \\ z \end{pmatrix} = \begin{pmatrix} x + 2y + z \\ 2x + 6y + z \\ x + y + 4z \end{pmatrix}. $$
The result is a column vector whose entries reproduce the left hand sides of the original
linear system! As a result, we can rewrite the system in the matrix form
$$ A\,x = b \eqno(1.10) $$
as an equality between two vectors. This result is general; a linear system (1.7) consisting of m equations in n unknowns can be written in the matrix form (1.10) where A is the m × n coefficient matrix (1.6), x is the column vector of unknowns, and b is the column
vector containing the right hand sides, (1.8). This is the reason why we define matrix
multiplication as we do, and not by componentwise multiplication. The latter operation
turns out to be almost completely useless.
Now, the bad news. Matrix multiplication is not commutative. For example, BA may
not be defined even when A B is. Even if both are defined, they may be different sized
matrices. For example, the product of a row vector r, a 1 × n matrix, and a column vector c, an n × 1 matrix, is a 1 × 1 matrix, or scalar, s = r c, whereas the reversed product C = c r is an n × n matrix. For example,
$$ ( 1 \;\; 2 ) \begin{pmatrix} 3 \\ 0 \end{pmatrix} = 3, \qquad \text{whereas} \qquad \begin{pmatrix} 3 \\ 0 \end{pmatrix} ( 1 \;\; 2 ) = \begin{pmatrix} 3 & 6 \\ 0 & 0 \end{pmatrix}. $$

In computing the latter product, don't forget that we multiply the rows of the first matrix by the columns of the second. Even if A B and B A have the same size, which requires both A and B to be square matrices, we may still have A B ≠ B A. For example,
$$ \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix} \begin{pmatrix} 0 & 1 \\ -1 & 2 \end{pmatrix} = \begin{pmatrix} -2 & 5 \\ -4 & 11 \end{pmatrix} \;\neq\; \begin{pmatrix} 3 & 4 \\ 5 & 6 \end{pmatrix} = \begin{pmatrix} 0 & 1 \\ -1 & 2 \end{pmatrix} \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}. $$
On the other hand, matrix multiplication is associative, so
A (B C) = (A B) C
whenever A has size m × n, B has size n × p and C has size p × q; the result is a matrix of size m × q. Consequently, matrix algebra works much like ordinary algebra as long as one
is careful not to change the order of multiplicative factors without proper justification.
Remark : Since matrix multiplication is rows times columns, one can compute the
individual columns in a matrix product C = A B by multiplying A times the columns of
B. Thus the k th column of C is equal to the product of A with the k th column of B.
Explicitly,
$$ A\,B = A\,\bigl( b_1 \;\; b_2 \;\; \ldots \;\; b_p \bigr) = \bigl( A\,b_1 \;\; A\,b_2 \;\; \ldots \;\; A\,b_p \bigr). \eqno(1.11) $$
For example, the two columns of the matrix product
$$ \begin{pmatrix} 1 & -1 & 2 \\ 2 & 0 & -2 \end{pmatrix} \begin{pmatrix} 3 & 4 \\ 0 & 2 \\ -1 & 1 \end{pmatrix} = \begin{pmatrix} 1 & 4 \\ 8 & 6 \end{pmatrix} $$
are obtained by multiplying the first matrix with the individual columns of the second:
$$ \begin{pmatrix} 1 & -1 & 2 \\ 2 & 0 & -2 \end{pmatrix} \begin{pmatrix} 3 \\ 0 \\ -1 \end{pmatrix} = \begin{pmatrix} 1 \\ 8 \end{pmatrix}, \qquad \begin{pmatrix} 1 & -1 & 2 \\ 2 & 0 & -2 \end{pmatrix} \begin{pmatrix} 4 \\ 2 \\ 1 \end{pmatrix} = \begin{pmatrix} 4 \\ 6 \end{pmatrix}. $$
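The column-by-column description (1.11) is easy to check numerically; the short Python sketch below (ours, with ad hoc helper names) verifies it on the example above.

    # Verify (1.11): the k-th column of A B equals A times the k-th column of B.
    A = [[1, -1, 2], [2, 0, -2]]
    B = [[3, 4], [0, 2], [-1, 1]]

    def times_column(A, col):
        # Multiply A by a single column vector, given as a plain list.
        return [sum(A[i][k] * col[k] for k in range(len(col))) for i in range(len(A))]

    columns_of_B = [[row[j] for row in B] for j in range(len(B[0]))]
    product_columns = [times_column(A, col) for col in columns_of_B]
    print(product_columns)   # [[1, 8], [4, 6]], the columns of A B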

There are two important special matrices. The first is the zero matrix of size m × n, denoted O_{mn}, or just O if the size is clear from context. It forms the additive unit, so A + O = A = O + A for any matrix A of the same size. The role of the multiplicative unit is played by the square identity matrix
$$ I = I_n = \begin{pmatrix} 1 & 0 & 0 & \cdots & 0 \\ 0 & 1 & 0 & \cdots & 0 \\ 0 & 0 & 1 & \cdots & 0 \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & 0 & \cdots & 1 \end{pmatrix} $$
of size n × n. The entries of I along the diagonal (which runs from top left to bottom
right ) are equal to 1; the off-diagonal entries are all 0. As the reader can check, if A is

We will only use the term diagonal when the matrix is square.


any m × n matrix, then I_m A = A = A I_n. We will sometimes write the last equation as just I A = A = A I; even though the identity matrices can have different sizes, only one size is valid for each matrix product to be defined.
Let us conclude this section by summarizing the basic properties of matrix arithmetic.
In the following table, A, B, C are matrices, c, d scalars, O is a zero matrix, and I is an
identity matrix. The matrices must have the correct sizes for the indicated operations to
be defined.
Basic Matrix Arithmetic

    Commutativity: Matrix Addition            A + B = B + A
    Associativity: Matrix Addition            (A + B) + C = A + (B + C)
    Zero Matrix: Matrix Addition              A + O = A = O + A
    Additive Inverse                          A + (-A) = O,   -A = (-1) A
    Associativity: Scalar Multiplication      c (d A) = (c d) A
    Unit Scalar Multiplication                1 A = A
    Zero Scalar Multiplication                0 A = O
    Distributivity: Matrix Addition           c (A + B) = (c A) + (c B)
    Distributivity: Scalar Addition           (c + d) A = (c A) + (d A)
    Associativity: Matrix Multiplication      (A B) C = A (B C)
    Identity Matrix                           A I = A = I A
    Zero Matrix: Matrix Multiplication        A O = O = O A

1.3. Gaussian Elimination Regular Case.


With the basic matrix arithmetic operations in hand, let us now return to our main
subject solving linear systems. Our goal is to develop a systematic algorithm that will
solve all linear systems. We begin by replacing the system (1.7) by its matrix constituents.
It is convenient to ignore the vector of unknowns, and form the augmented matrix

$$ M = \bigl(\, A \,\big|\, b \,\bigr) = \left( \begin{array}{cccc|c} a_{11} & a_{12} & \cdots & a_{1n} & b_1 \\ a_{21} & a_{22} & \cdots & a_{2n} & b_2 \\ \vdots & \vdots & \ddots & \vdots & \vdots \\ a_{m1} & a_{m2} & \cdots & a_{mn} & b_m \end{array} \right), \eqno(1.12) $$
which is an m × (n + 1) matrix obtained by tacking the right hand side vector onto the
original coefficient matrix. The extra vertical line is included just to remind us that the
last column of this matrix is special. For example, the augmented matrix for the system
(1.1), i.e.,

$$ x + 2y + z = 2, \quad 2x + 6y + z = 7, \quad x + y + 4z = 3, \qquad \text{is} \qquad M = \left( \begin{array}{ccc|c} 1 & 2 & 1 & 2 \\ 2 & 6 & 1 & 7 \\ 1 & 1 & 4 & 3 \end{array} \right). \eqno(1.13) $$

Note that one can immediately recover the original linear system from the augmented
matrix. Since operations on equations also affect their right hand sides, keeping track of
everything is most easily done through the augmented matrix.
For the time being, we will exclusively look at linear systems containing the same
number, n, of equations as unknowns. The associated coefficient matrix A is square, of size n × n. The corresponding augmented matrix M = ( A | b ) then has size n × (n + 1).


The more general situation of m equations in n unknowns will be deferred until Section 1.8.
The matrix operation that assumes the role of Linear System Operation #1 is:
Elementary Row Operation #1 :
Add a scalar multiple of one row of the augmented matrix to another row.
For example, if we add -2 times the first row of the augmented matrix (1.13) to the second row, the result is the row vector
$$ -2\,( 1 \;\; 2 \;\; 1 \;\; 2 ) + ( 2 \;\; 6 \;\; 1 \;\; 7 ) = ( 0 \;\; 2 \;\; -1 \;\; 3 ). $$
The result can be recognized as the second row of the modified augmented matrix
$$ \left( \begin{array}{ccc|c} 1 & 2 & 1 & 2 \\ 0 & 2 & -1 & 3 \\ 1 & 1 & 4 & 3 \end{array} \right) \eqno(1.14) $$

that corresponds to the first equivalent system (1.2). When the elementary row operation
#1 is performed, it is critical that the result replace the row being added to, not the row being multiplied by the scalar.
Notice that the elimination of a variable in an equation (in this case, the first variable in the second equation) amounts to making its entry in the coefficient matrix equal to
zero. We shall call the (1, 1) entry of the coefficient matrix the first pivot. The precise
definition of pivot will become clear as we continue; the one key requirement is that a pivot
be nonzero. Eliminating the first variable x from the second and third equations amounts
to making all the matrix entries in the column below the pivot equal to zero. We have
already done this with the (2, 1) entry in (1.14). To make the (3, 1) entry equal to zero,
we subtract the first row from the last row. The resulting augmented matrix is

$$ \left( \begin{array}{ccc|c} 1 & 2 & 1 & 2 \\ 0 & 2 & -1 & 3 \\ 0 & -1 & 3 & 1 \end{array} \right), \eqno(1.15) $$

which corresponds to the system (1.3). The second pivot is the (2, 2) entry of this matrix,
which is 2, and is the coefficient of the second variable in the second equation. Again, the
pivot must be nonzero. We use the elementary row operation of adding 1/2 of the second
row to the third row to make the entry below this second pivot equal to 0; the result is
the augmented matrix

$$ N = \left( \begin{array}{ccc|c} 1 & 2 & 1 & 2 \\ 0 & 2 & -1 & 3 \\ 0 & 0 & \frac{5}{2} & \frac{5}{2} \end{array} \right). \eqno(1.16) $$

Gaussian Elimination Regular Case

    start
    for j = 1 to n
        if m_jj = 0, stop; print "A is not regular"
        else for i = j+1 to n
            set l_ij = m_ij / m_jj
            add -l_ij times row j of M to row i of M
        next i
    next j
    end

that corresponds to the triangular system (1.4). We write the final augmented matrix as

$$ N = \bigl(\, U \,\big|\, c \,\bigr), \qquad \text{where} \qquad U = \begin{pmatrix} 1 & 2 & 1 \\ 0 & 2 & -1 \\ 0 & 0 & \frac{5}{2} \end{pmatrix}, \qquad c = \begin{pmatrix} 2 \\ 3 \\ \frac{5}{2} \end{pmatrix}. $$

The linear system (1.4) corresponding to (1.16) has vector form


$$ U\,x = c. \eqno(1.17) $$
The coefficient matrix U is upper triangular, which means that all its entries below the main diagonal are zero. The nonzero entries on the diagonal, 1, 2, 5/2, including the last one in the (3, 3) slot, are the three pivots. Once the system has been reduced to triangular
form (1.17), we can easily solve it, as discussed earlier, by back substitution.
The preceding algorithm for solving a linear system is known as regular Gaussian
elimination, in honor of one of the all-time mathematical greats, the nineteenth century German mathematician Carl Friedrich Gauss. A square matrix A will be called regular if the algorithm successfully reduces it to upper triangular form U with all non-zero pivots on the diagonal. In other words, for regular matrices, we identify each successive nonzero entry in the diagonal position as the pivot, and then make all the entries in the column below the pivot equal to zero through elementary row operations of Type #1.
Any system with regular coefficient matrix can be solved by first reducing the augmented
matrix to upper triangular form and then solving the resulting triangular system by back
substitution.
Let us state this algorithm in the form of a program, written in a general pseudocode
that can be easily translated into a specific language, e.g., C++, Fortran, Maple,
Mathematica or Matlab. We use a single letter M = (m_ij) to denote the current augmented matrix at each stage in the computation, and initialize M = ( A | b ). Note that the entries of M will be changing as the algorithm progresses. The final output of the program, assuming A is regular, is the augmented matrix M = ( U | c ), where U is

the upper triangular matrix U whose diagonal entries, u_ii = m_ii, are the pivots, and c is
the vector of right hand sides obtained after performing the elementary row operations.
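For readers who prefer an executable version, the following Python sketch mirrors the pseudocode box above; it is our translation, not the author's program. It overwrites M in place and raises an error when A is not regular.

    # Regular Gaussian elimination on the augmented matrix M = [A | b],
    # stored as a list of row lists of length n + 1.
    def regular_gaussian_elimination(M):
        n = len(M)
        for j in range(n):                      # loop over the pivot columns
            if M[j][j] == 0:
                raise ValueError("A is not regular")
            for i in range(j + 1, n):
                l = M[i][j] / M[j][j]           # the multiplier l_ij
                for k in range(j, n + 1):       # include the right hand side column
                    M[i][k] -= l * M[j][k]
        return M                                 # now M = [U | c] in upper triangular form

    M = [[1.0, 2.0, 1.0, 2.0],
         [2.0, 6.0, 1.0, 7.0],
         [1.0, 1.0, 4.0, 3.0]]
    print(regular_gaussian_elimination(M))       # pivots 1, 2, 2.5 on the diagonal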
Elementary Matrices
Elementary row operations can, in fact, be realized by matrix multiplication.
Definition 1.1. The elementary matrix E associated with an elementary row operation for m × n matrices is the m × m matrix obtained by applying the row operation to the identity matrix.
For example, applying the elementary row operation that adds -2 times the first row to the second row of the identity matrix
$$ I = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 1 \end{pmatrix} \qquad \text{results in the corresponding elementary matrix} \qquad E_1 = \begin{pmatrix} 1 & 0 & 0 \\ -2 & 1 & 0 \\ 0 & 0 & 1 \end{pmatrix}. $$
We claim that, if A is any 3 × n matrix, then multiplying A on the left by E_1 to form the matrix product E_1 A has the same effect as the given elementary row operation. In particular, the augmented matrix (1.14), which was obtained from the augmented matrix M in (1.13) by such a row operation, is equal to the product
$$ E_1 M = \begin{pmatrix} 1 & 0 & 0 \\ -2 & 1 & 0 \\ 0 & 0 & 1 \end{pmatrix} \left( \begin{array}{ccc|c} 1 & 2 & 1 & 2 \\ 2 & 6 & 1 & 7 \\ 1 & 1 & 4 & 3 \end{array} \right) = \left( \begin{array}{ccc|c} 1 & 2 & 1 & 2 \\ 0 & 2 & -1 & 3 \\ 1 & 1 & 4 & 3 \end{array} \right), $$
as the reader can verify. If we set
$$ E_1 = \begin{pmatrix} 1 & 0 & 0 \\ -2 & 1 & 0 \\ 0 & 0 & 1 \end{pmatrix}, \qquad E_2 = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ -1 & 0 & 1 \end{pmatrix}, \qquad E_3 = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & \frac{1}{2} & 1 \end{pmatrix}, \eqno(1.18) $$

then multiplication by E_1 will subtract twice the first row from the second row, multiplication by E_2 will subtract the first row from the third row, and multiplication by E_3 will add 1/2 the second row to the third row, precisely the row operations used to place our original system in triangular form. Therefore, performing them in the correct order (and using the associativity of matrix multiplication), we conclude that when
$$ A = \begin{pmatrix} 1 & 2 & 1 \\ 2 & 6 & 1 \\ 1 & 1 & 4 \end{pmatrix}, \qquad \text{then} \qquad E_3\,E_2\,E_1\,A = U = \begin{pmatrix} 1 & 2 & 1 \\ 0 & 2 & -1 \\ 0 & 0 & \frac{5}{2} \end{pmatrix}. \eqno(1.19) $$
The reader should check this by directly multiplying the indicated matrices.
In general, then, the elementary matrix E of size m × m will have all 1's on the diagonal, and a single nonzero entry c in position (i, j), where i ≠ j. If A is any m × n matrix, then
the matrix product E A is equal to the matrix obtained from A by the elementary row
operation adding c times row j to row i. (Note the reversal of order of i and j.)
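A small computational check of this claim (ours, with hypothetical helper names) is easy to write; the sketch below builds the elementary matrix for "add c times row j to row i" and applies it to the coefficient matrix of (1.1).

    # Build the m x m elementary matrix that adds c times row j to row i (0-based indices).
    def elementary_matrix(m, i, j, c):
        E = [[1.0 if r == s else 0.0 for s in range(m)] for r in range(m)]
        E[i][j] = c
        return E

    def matmul(A, B):
        return [[sum(A[r][k] * B[k][s] for k in range(len(B))) for s in range(len(B[0]))]
                for r in range(len(A))]

    A = [[1.0, 2.0, 1.0], [2.0, 6.0, 1.0], [1.0, 1.0, 4.0]]
    E1 = elementary_matrix(3, 1, 0, -2.0)    # add -2 times row 0 to row 1
    print(matmul(E1, A))                      # second row becomes [0, 2, -1]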

The elementary row operation that undoes the operation of adding c times row j to row i is the inverse operation that subtracts c (or, equivalently, adds -c) times row j from row i. The corresponding inverse elementary matrix again has 1's along the diagonal, but has -c in the (i, j) slot. Let us denote the inverses of the particular elementary matrices (1.18) by L_i, so that, according to our general rule,
$$ L_1 = \begin{pmatrix} 1 & 0 & 0 \\ 2 & 1 & 0 \\ 0 & 0 & 1 \end{pmatrix}, \qquad L_2 = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 1 & 0 & 1 \end{pmatrix}, \qquad L_3 = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & -\frac{1}{2} & 1 \end{pmatrix}. \eqno(1.20) $$
Note that the product
$$ L_i\,E_i = I \eqno(1.21) $$

is the 3 × 3 identity matrix, reflecting the fact that these are inverse operations. (A more
thorough discussion of matrix inverses will be postponed until the following section.)
The product of these three matrices is equal to

$$ L = L_1\,L_2\,L_3 = \begin{pmatrix} 1 & 0 & 0 \\ 2 & 1 & 0 \\ 1 & -\frac{1}{2} & 1 \end{pmatrix}. \eqno(1.22) $$

The matrix L is called a special lower triangular matrix, which means that all the entries
above the main diagonal are 0 and all the entries on the diagonal are equal to 1. The entries
of L below the diagonal are the same as the corresponding nonzero entries in the L i ! This
is a general fact, that holds when the lower triangular elementary matrices are multiplied
in the correct order. (For instance, the product L3 L2 L1 is not so easily predicted.) More
generally, the following elementary consequence of the laws of matrix multiplication will
be used extensively.
Lemma 1.2. If L and $\widehat{L}$ are lower triangular matrices of the same size, so is their product $L\,\widehat{L}$. If they are both special lower triangular, so is their product. Similarly, if U, $\widehat{U}$ are (special) upper triangular matrices, so is their product $U\,\widehat{U}$.
The L U Factorization

We have almost arrived at our first important result. Consider the product of L and
U in (1.19), (1.22). Using equation (1.21), along with the basic property of the identity
matrix I , and the associativity of matrix multiplication, we conclude that
$$ L\,U = (L_1 L_2 L_3)(E_3 E_2 E_1 A) = L_1 L_2 (L_3 E_3) E_2 E_1 A = L_1 L_2\, I\, E_2 E_1 A = L_1 (L_2 E_2) E_1 A = L_1\, I\, E_1 A = L_1 E_1 A = I\,A = A. $$
In other words, we have factorized the coefficient matrix A = L U into a product of a
special lower triangular matrix L and an upper triangular matrix U with the nonzero
pivots on its main diagonal. The same holds true for almost all square coefficient matrices!

Theorem 1.3. A matrix A is regular if and only if it admits an L U factorization


$$ A = L\,U, \eqno(1.23) $$
where L is a special lower triangular matrix, having all 1's on the diagonal, and U is upper triangular with nonzero diagonal entries, which are the pivots for A. The nonzero off-diagonal entries, l_ij for i > j, of L prescribe the elementary row operations that bring A into upper triangular form; namely, one subtracts l_ij times row j from row i at the
appropriate step of the Gaussian elimination process.
Example 1.4. Let us compute the L U factorization of the matrix
$$ A = \begin{pmatrix} 2 & 1 & 1 \\ 4 & 5 & 2 \\ 2 & -2 & 0 \end{pmatrix}. $$
Applying the Gaussian elimination algorithm, we first subtract twice the first row from the second row, and then subtract the first row from the third. The result is the matrix
$$ \begin{pmatrix} 2 & 1 & 1 \\ 0 & 3 & 0 \\ 0 & -3 & -1 \end{pmatrix}. $$
The next step adds the second row to the third row, leading to the upper triangular matrix
$$ U = \begin{pmatrix} 2 & 1 & 1 \\ 0 & 3 & 0 \\ 0 & 0 & -1 \end{pmatrix}. $$
The corresponding lower triangular matrix is
$$ L = \begin{pmatrix} 1 & 0 & 0 \\ 2 & 1 & 0 \\ 1 & -1 & 1 \end{pmatrix}, $$
whose entries below the diagonal are the negatives of the multiples we used during the elimination procedure. Namely, the (2, 1) entry of 2 indicates that we added -2 times the first row to the second row; the (3, 1) entry of 1 indicates that we added -1 times the first row to the third; and, finally, the (3, 2) entry of -1 indicates that we added the second row to the third row at the appropriate stage of the procedure. The reader might wish to verify the factorization A = L U, or, explicitly,
$$ \begin{pmatrix} 2 & 1 & 1 \\ 4 & 5 & 2 \\ 2 & -2 & 0 \end{pmatrix} = \begin{pmatrix} 1 & 0 & 0 \\ 2 & 1 & 0 \\ 1 & -1 & 1 \end{pmatrix} \begin{pmatrix} 2 & 1 & 1 \\ 0 & 3 & 0 \\ 0 & 0 & -1 \end{pmatrix}. \eqno(1.24) $$
Forward and Back Substitution
Once we know the L U factorization of a matrix A, we are able to solve any associated
linear system A x = b in two stages:
(1) First solve the lower triangular system
$$ L\,c = b \eqno(1.25) $$
for the vector c by forward substitution. This is the same as back substitution, except one solves the equations for the variables in the opposite order, from first to last. Explicitly, we compute
$$ c_i = b_i - \sum_{j < i} l_{ij}\, c_j, \qquad \text{for} \quad i = 1, 2, \ldots, n-1, n, \eqno(1.26) $$
noting that the previously computed values of c_1, ..., c_{i-1} are used to determine the next c_i.
(2) Second, one solves the upper triangular system
$$ U\,x = c \eqno(1.27) $$
by back substitution. Explicitly, the values of the unknowns
$$ x_i = \frac{1}{u_{ii}} \Bigl( c_i - \sum_{j > i} u_{ij}\, x_j \Bigr), \qquad \text{for} \quad i = n, n-1, \ldots, 2, 1, \eqno(1.28) $$
are successively computed, but now in reverse order.


Note that this algorithm does indeed solve the original system, since if
$$ U\,x = c \qquad \text{and} \qquad L\,c = b, \qquad \text{then} \qquad A\,x = L\,U\,x = L\,c = b. $$

Once we have found the L U factorization of the coefficient matrix A, the Forward and
Back Substitution processes produce the solution quickly. They are easily implemented on
a computer.
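The two substitution formulas (1.26) and (1.28) translate into a few lines of code; here is a hedged Python sketch (ours, not the text's) for solving L c = b and then U x = c, tried on the factorization (1.24) with the right hand side used in the example that follows.

    # Solve A x = b given the factors L (special lower triangular) and U (upper triangular).
    def forward_substitution(L, b):
        n = len(b)
        c = [0.0] * n
        for i in range(n):                               # first to last, as in (1.26)
            c[i] = b[i] - sum(L[i][j] * c[j] for j in range(i))
        return c

    def back_substitution(U, c):
        n = len(c)
        x = [0.0] * n
        for i in range(n - 1, -1, -1):                   # last to first, as in (1.28)
            x[i] = (c[i] - sum(U[i][j] * x[j] for j in range(i + 1, n))) / U[i][i]
        return x

    L = [[1.0, 0.0, 0.0], [2.0, 1.0, 0.0], [1.0, -1.0, 1.0]]
    U = [[2.0, 1.0, 1.0], [0.0, 3.0, 0.0], [0.0, 0.0, -1.0]]
    b = [1.0, 2.0, 2.0]
    print(back_substitution(U, forward_substitution(L, b)))   # [1.0, 0.0, -1.0]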
Example 1.5. With the L U decomposition
$$ \begin{pmatrix} 2 & 1 & 1 \\ 4 & 5 & 2 \\ 2 & -2 & 0 \end{pmatrix} = \begin{pmatrix} 1 & 0 & 0 \\ 2 & 1 & 0 \\ 1 & -1 & 1 \end{pmatrix} \begin{pmatrix} 2 & 1 & 1 \\ 0 & 3 & 0 \\ 0 & 0 & -1 \end{pmatrix} $$
found in Example 1.4 in hand, we can readily solve any linear system with the given
coefficient matrix by Forward and Back Substitution. For instance, to find the solution to
$$ \begin{pmatrix} 2 & 1 & 1 \\ 4 & 5 & 2 \\ 2 & -2 & 0 \end{pmatrix} \begin{pmatrix} x \\ y \\ z \end{pmatrix} = \begin{pmatrix} 1 \\ 2 \\ 2 \end{pmatrix}, $$
we first solve the lower triangular system
$$ \begin{pmatrix} 1 & 0 & 0 \\ 2 & 1 & 0 \\ 1 & -1 & 1 \end{pmatrix} \begin{pmatrix} a \\ b \\ c \end{pmatrix} = \begin{pmatrix} 1 \\ 2 \\ 2 \end{pmatrix}. $$
The first equation says a = 1; substituting into the second, we find b = 0; the final equation gives c = 1. We then solve the upper triangular system
$$ \begin{pmatrix} 2 & 1 & 1 \\ 0 & 3 & 0 \\ 0 & 0 & -1 \end{pmatrix} \begin{pmatrix} x \\ y \\ z \end{pmatrix} = \begin{pmatrix} a \\ b \\ c \end{pmatrix} = \begin{pmatrix} 1 \\ 0 \\ 1 \end{pmatrix}. $$

In turn, we find z = -1, then y = 0, then x = 1, which gives the unique solution to the
original system.
Of course, if we are not given the L U factorization in advance, we can just use direct
Gaussian elimination on the augmented matrix. Forward and Back Substitution is useful
if one has already computed the factorization by solving for a particular right hand side
b, but then later wants to know the solutions for alternative b's.

1.4. Pivoting and Permutations.


The method of Gaussian elimination presented so far applies to regular matrices.
Not every square matrix is regular; a simple class of examples are matrices whose upper
left entry is zero, and so cannot serve as the first pivot. More generally, the regular
elimination algorithm cannot proceed when a zero entry appears in the current pivot spot
on the diagonal. Zero can never serve as a pivot, since we cannot use it to eliminate any
nonzero entries in the column below it. What then to do? The answer is revisit the source
of our algorithm.
For example, consider the linear system
$$ 3y + z = 2, \qquad 2x + 6y + z = 7, \qquad x + 4z = 3. \eqno(1.29) $$
The augmented coefficient matrix is
$$ \left( \begin{array}{ccc|c} 0 & 3 & 1 & 2 \\ 2 & 6 & 1 & 7 \\ 1 & 0 & 4 & 3 \end{array} \right). \eqno(1.30) $$

In this case, the (1, 1) entry is 0, and so cannot serve as a pivot. The problem, of course,
is that the first variable x does not appear in the first equation, and so we cannot use it
to eliminate x in the other two equations. But this problem is, actually, a benefit we
already have an equation with only two variables in it, and so only need to eliminate x
from one of the other two equations. To be systematic, we rewrite the system in a different
order,
2 x + 6 y + z = 7,
3 y + z = 2,
x + 4 z = 3,
by interchanging the first two equations. In other words, we use
Linear System Operation #2 : Interchange two equations.
Clearly this operation does not change the solution, and so produces an equivalent
linear system. In our case, the resulting augmented coefficient matrix is

$$ \left( \begin{array}{ccc|c} 2 & 6 & 1 & 7 \\ 0 & 3 & 1 & 2 \\ 1 & 0 & 4 & 3 \end{array} \right), \eqno(1.31) $$

and is obtained from the original one by performing the second type of elementary row
operation:
Elementary Row Operation #2 : Interchange two rows of the matrix.
The new nonzero upper left entry, 2, can now serve as the first pivot, and we may now
continue to apply elementary row operations of Type #1 to reduce our matrix to upper
triangular form. For this particular example, we eliminate the remaining nonzero entry in
the first column by subtracting 1/2 of the first row from the last:
$$ \left( \begin{array}{ccc|c} 2 & 6 & 1 & 7 \\ 0 & 3 & 1 & 2 \\ 0 & -3 & \frac{7}{2} & -\frac{1}{2} \end{array} \right). $$
The (2, 2) entry of 3 serves as the next pivot. To eliminate the nonzero entry below it, we add the second to the third row:
$$ \left( \begin{array}{ccc|c} 2 & 6 & 1 & 7 \\ 0 & 3 & 1 & 2 \\ 0 & 0 & \frac{9}{2} & \frac{3}{2} \end{array} \right). $$
We have now placed the system in upper triangular form, with the three pivots, 2, 3, 9/2, along the diagonal. As before, back substitution produces the solution
$$ x = \tfrac{5}{3}, \qquad y = \tfrac{5}{9}, \qquad z = \tfrac{1}{3}. \eqno(1.32) $$

The row interchange that is required when a zero shows up on the diagonal in pivot
position is known as pivoting. Later, in Section 1.7, we shall discuss practical reasons for
pivoting even when the diagonal entry is nonzero. The coefficient matrices for which the
Gaussian elimination algorithm with pivoting produces the solution are of fundamental
importance.
Definition 1.6. A square matrix is called nonsingular if it can be reduced to upper
triangular form with all non-zero elements on the diagonal by elementary row operations
of Types 1 and 2. Conversely, a square matrix that cannot be reduced to upper triangular
form because at some stage in the elimination procedure the diagonal entry and all the
entries below it are zero is called singular .
The revised version of the Gaussian Elimination algorithm, valid for all nonsingular
coefficient matrices, is implemented by the following program. The starting point is the augmented matrix M = ( A | b ) representing the linear system A x = b. After successful termination of the program, the result is an augmented matrix in upper triangular form M = ( U | c ) representing the equivalent linear system U x = c. One then uses Back Substitution to determine the solution x to the linear system.
Every regular matrix is nonsingular, but, as we just saw, the converse statement is not
valid. A key fact is that uniqueness of solutions is a defining characteristic of nonsingularity.
Theorem 1.7. A linear system A x = b has a unique solution for every choice of
right hand side b if and only if its coefficient matrix A is square and nonsingular.

Gaussian Elimination Nonsingular Case

    start
    for j = 1 to n
        if m_kj = 0 for all k ≥ j, stop; print "A is singular"
        if m_jj = 0 but m_kj ≠ 0 for some k > j, switch rows k and j
        for i = j+1 to n
            set l_ij = m_ij / m_jj
            add -l_ij times row j to row i of M
        next i
    next j
    end
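A hedged Python version of this nonsingular-case algorithm (again our own translation, not the author's program) adds the row-interchange step to the earlier sketch; it swaps in the first nonzero entry it finds at or below a zero pivot.

    # Gaussian elimination with row interchanges on the augmented matrix M = [A | b].
    def gaussian_elimination(M):
        n = len(M)
        for j in range(n):
            # Find a row at or below j with a nonzero entry in column j.
            pivot_row = next((k for k in range(j, n) if M[k][j] != 0), None)
            if pivot_row is None:
                raise ValueError("A is singular")
            if pivot_row != j:
                M[j], M[pivot_row] = M[pivot_row], M[j]        # Elementary Row Operation #2
            for i in range(j + 1, n):
                l = M[i][j] / M[j][j]
                for k in range(j, n + 1):
                    M[i][k] -= l * M[j][k]                     # Elementary Row Operation #1
        return M

    M = [[0.0, 3.0, 1.0, 2.0],
         [2.0, 6.0, 1.0, 7.0],
         [1.0, 0.0, 4.0, 3.0]]
    print(gaussian_elimination(M))    # upper triangular, pivots 2, 3, 4.5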

We are able to prove the "if" part of this theorem, since nonsingularity implies reduction to an equivalent upper triangular form that has the same solutions as the original system, and so the unique solution to the system is found by back substitution. The "only if" part
will be proved in Section 1.8.
Permutation Matrices
There is a factorization of nonsingular matrices that are not regular, and so require
row interchanges, which is analogous to the L U decomposition in the regular case. To find
it, we need to understand row interchanges, i.e., pivoting, in more depth.
As with the first type of elementary row operation, row interchanges can be accomplished by multiplication by a second type of elementary matrix. Again, the elementary
matrix is found by applying the row operation in question to the identity matrix. For
instance, interchanging rows 1 and 2 of the identity produces the elementary interchange
matrix

$$ P = \begin{pmatrix} 0 & 1 & 0 \\ 1 & 0 & 0 \\ 0 & 0 & 1 \end{pmatrix}. $$

As the reader can check, the effect of multiplying a 3 × n matrix A on the left by P,
producing P A, is the same as interchanging the first two rows of A. Multiple row interchanges are accomplished by multiplying such elementary interchange matrices together.
Each such combination of row interchanges corresponds to a unique permutation matrix.
Definition 1.8. A permutation matrix is a matrix obtained from the identity matrix
by any combination of row interchanges.

In particular, applying a row interchange to a permutation matrix produces another


permutation matrix. The following result is easily established.

Lemma 1.9. A matrix P is a permutation matrix if and only if each row of P


contains all 0 entries except for a single 1, and, in addition, each column of P also contains
all 0 entries except for a single 1.
In general, if a permutation matrix P has a 1 in position (i, j), then the effect of multiplication by P is to move the j-th row of A into the i-th row of the product P A.
Example 1.10. There are six different 3 × 3 permutation matrices, namely
$$ \begin{pmatrix} 1&0&0 \\ 0&1&0 \\ 0&0&1 \end{pmatrix}, \quad \begin{pmatrix} 0&1&0 \\ 0&0&1 \\ 1&0&0 \end{pmatrix}, \quad \begin{pmatrix} 0&0&1 \\ 1&0&0 \\ 0&1&0 \end{pmatrix}, \quad \begin{pmatrix} 0&1&0 \\ 1&0&0 \\ 0&0&1 \end{pmatrix}, \quad \begin{pmatrix} 0&0&1 \\ 0&1&0 \\ 1&0&0 \end{pmatrix}, \quad \begin{pmatrix} 1&0&0 \\ 0&0&1 \\ 0&1&0 \end{pmatrix}. \eqno(1.33) $$

These have the following effects on 3 × 3 matrices: if A is a matrix with row vectors r_1, r_2, r_3, then multiplication on the left by each of the six permutation matrices produces
$$ \begin{pmatrix} r_1 \\ r_2 \\ r_3 \end{pmatrix}, \quad \begin{pmatrix} r_2 \\ r_3 \\ r_1 \end{pmatrix}, \quad \begin{pmatrix} r_3 \\ r_1 \\ r_2 \end{pmatrix}, \quad \begin{pmatrix} r_2 \\ r_1 \\ r_3 \end{pmatrix}, \quad \begin{pmatrix} r_3 \\ r_2 \\ r_1 \end{pmatrix}, \quad \begin{pmatrix} r_1 \\ r_3 \\ r_2 \end{pmatrix}, $$
respectively. Thus, the first permutation matrix, which is the identity, does nothing: the identity permutation. The fourth, fifth and sixth represent row interchanges. The second
and third are more involved permutations; each can be realized by performing a pair of
row interchanges.
It is not hard to prove that there are a total of
$$ n! = n\,(n-1)\,(n-2)\,\cdots\,3 \cdot 2 \cdot 1 \eqno(1.34) $$

different permutation matrices of size n × n. The product P = P_1 P_2 of any two permutation matrices is also a permutation matrix. An important point is that multiplication of permutation matrices is noncommutative: the order in which one permutes makes a
difference. Switching the first and second rows, and then switching the second and third
rows does not have the same effect as first switching the second and third rows and then
switching the first and second rows!
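This noncommutativity is easy to confirm by a direct computation; the short Python sketch below (ours) multiplies the two elementary interchange matrices in both orders.

    # P12 swaps rows 1 and 2; P23 swaps rows 2 and 3 (of a 3-row matrix).
    P12 = [[0, 1, 0], [1, 0, 0], [0, 0, 1]]
    P23 = [[1, 0, 0], [0, 0, 1], [0, 1, 0]]

    def matmul(A, B):
        return [[sum(A[i][k] * B[k][j] for k in range(3)) for j in range(3)] for i in range(3)]

    print(matmul(P12, P23))   # [[0, 0, 1], [1, 0, 0], [0, 1, 0]]
    print(matmul(P23, P12))   # [[0, 1, 0], [0, 0, 1], [1, 0, 0]], a different permutation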
The Permuted L U Factorization
As we now know, any nonsingular matrix A can be reduced to upper triangular form
by elementary row operations of types #1 and #2. The row interchanges merely reorder
the equations. If one performs all of the required row interchanges in advance, then
the elimination algorithm can proceed without requiring any further pivoting. Thus, the
matrix obtained by permuting the rows of A in the prescribed manner is regular. In other
words, if A is a nonsingular matrix, then there is a permutation matrix P such that the

product P A is regular, and hence admits an L U factorization. As a result, we deduce the


general permuted L U factorization
$$ P\,A = L\,U, \eqno(1.35) $$
where P is a permutation matrix, L is special lower triangular, and U is upper triangular with the pivots on the diagonal. Thus, Theorem 1.3 implies the following generalization.
Theorem 1.11. An n × n matrix A is nonsingular if and only if it has n nonzero pivots, if and only if it admits a permuted L U factorization (1.35).
For a coefficient matrix of the same type as (1.30), where the first and second rows must be permuted, equation (1.35) takes the explicit form
$$ \begin{pmatrix} 0 & 1 & 0 \\ 1 & 0 & 0 \\ 0 & 0 & 1 \end{pmatrix} \begin{pmatrix} 0 & 2 & 1 \\ 2 & 6 & 1 \\ 1 & 1 & 4 \end{pmatrix} = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ \frac{1}{2} & -1 & 1 \end{pmatrix} \begin{pmatrix} 2 & 6 & 1 \\ 0 & 2 & 1 \\ 0 & 0 & \frac{9}{2} \end{pmatrix}. \eqno(1.36) $$

One should be aware of a couple of practical complications. First, to implement the


permutation of the rows that makes A regular, one needs to be somewhat clairvoyant: it
is not always clear in advance when and where a row interchange will crop up. Second,
any row interchange performed during the course of the Gaussian Elimination algorithm
will affect the lower triangular matrix L, and precomputed entries must be permuted
accordingly to maintain the validity of the decomposition (1.35). See Exercise for an
example.
Once the permuted L U factorization is established, the solution to the original system
(1.10) is obtained by using the same Forward and Back Substitution algorithm presented
above. Explicitly, we first multiply the system A x = b by the permutation matrix, leading
to
$$ P\,A\,x = P\,b \equiv \widehat{b}, \eqno(1.37) $$
whose right hand side $\widehat{b}$ has been obtained by permuting the entries of b in the same fashion as the rows of A. We then solve the two systems
$$ L\,c = \widehat{b} \qquad \text{and} \qquad U\,x = c, \eqno(1.38) $$
by, respectively, forward and back substitution as before.
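Putting the pieces together, a hedged Python sketch of the permuted solve (1.37) and (1.38) might look as follows; it is ours, not the author's, reuses forward and back substitution, and represents the permutation simply by the row ordering it induces.

    # Solve A x = b given a permuted factorization P A = L U, with P stored as a row ordering.
    def solve_permuted_lu(perm, L, U, b):
        n = len(b)
        b_hat = [b[perm[i]] for i in range(n)]                  # b_hat = P b, as in (1.37)
        c = [0.0] * n
        for i in range(n):                                      # forward substitution, L c = b_hat
            c[i] = b_hat[i] - sum(L[i][j] * c[j] for j in range(i))
        x = [0.0] * n
        for i in range(n - 1, -1, -1):                          # back substitution, U x = c
            x[i] = (c[i] - sum(U[i][j] * x[j] for j in range(i + 1, n))) / U[i][i]
        return x

    # The factorization (1.36): rows 1 and 2 are interchanged.
    perm = [1, 0, 2]
    L = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.5, -1.0, 1.0]]
    U = [[2.0, 6.0, 1.0], [0.0, 2.0, 1.0], [0.0, 0.0, 4.5]]
    print(solve_permuted_lu(perm, L, U, [1.0, 2.0, 2.0]))       # approximately [-1/18, 5/18, 4/9]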


Example 1.12. Suppose we wish to solve
$$ \begin{pmatrix} 0 & 2 & 1 \\ 2 & 6 & 1 \\ 1 & 1 & 4 \end{pmatrix} \begin{pmatrix} x \\ y \\ z \end{pmatrix} = \begin{pmatrix} 1 \\ 2 \\ 2 \end{pmatrix}. $$
In view of the P A = L U factorization established in (1.36), we need to solve the two auxiliary systems (1.38) by forward and back substitution, respectively. The lower triangular system is
$$ \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ \frac{1}{2} & -1 & 1 \end{pmatrix} \begin{pmatrix} a \\ b \\ c \end{pmatrix} = \begin{pmatrix} 2 \\ 1 \\ 2 \end{pmatrix} = \begin{pmatrix} 0 & 1 & 0 \\ 1 & 0 & 0 \\ 0 & 0 & 1 \end{pmatrix} \begin{pmatrix} 1 \\ 2 \\ 2 \end{pmatrix}, $$

with solution
$$ a = 2, \qquad b = 1, \qquad c = 2 - \tfrac{1}{2}\,a + b = 2. $$
The resulting upper triangular system is
$$ \begin{pmatrix} 2 & 6 & 1 \\ 0 & 2 & 1 \\ 0 & 0 & \frac{9}{2} \end{pmatrix} \begin{pmatrix} x \\ y \\ z \end{pmatrix} = \begin{pmatrix} 2 \\ 1 \\ 2 \end{pmatrix} = \begin{pmatrix} a \\ b \\ c \end{pmatrix}. $$
The solution, which is also the solution to the original system, is obtained by back substitution, with
$$ z = \tfrac{4}{9}, \qquad y = \tfrac{1}{2}\,(1 - z) = \tfrac{5}{18}, \qquad x = \tfrac{1}{2}\,(2 - 6y - z) = -\tfrac{1}{18}. $$

1.5. Matrix Inverses.


We already encountered the notion of an inverse in our discussion of elementary matrices. The inverse of a matrix plays the same role in matrix multiplication that the
reciprocal a^{-1} = 1/a of a scalar plays in ordinary multiplication. Let us begin with the
formal definition and then investigate some consequences.
Definition 1.13. Let A be a square matrix of size n × n. An n × n matrix X is called the inverse of A if it satisfies
$$ X\,A = I = A\,X, \eqno(1.39) $$
where I = I_n is the n × n identity matrix. The inverse is commonly denoted by X = A^{-1}.


Remark : Noncommutativity of matrix multiplication requires that we impose both
conditions in (1.39) in order to properly define an inverse to the matrix A. The first
condition X A = I says that X is a left inverse, while the second A X = I requires that
X also be a right inverse, in order that it fully qualify as a bona fide inverse of A.
Not every square matrix has an inverse. Indeed, not every scalar has an inverse, the one counterexample being a = 0. There is no general concept of inverse for rectangular
matrices.

Example 1.14. Let us compute the inverse of a general 2 × 2 matrix
$$ A = \begin{pmatrix} a & b \\ c & d \end{pmatrix}. $$
We will write the inverse as
$$ X = \begin{pmatrix} x & y \\ z & w \end{pmatrix}. $$
The right inverse condition
$$ A\,X = \begin{pmatrix} a x + b z & a y + b w \\ c x + d z & c y + d w \end{pmatrix} = \begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix} = I $$
holds if and only if x, y, z, w satisfy the linear system
$$ a x + b z = 1, \qquad a y + b w = 0, \qquad c x + d z = 0, \qquad c y + d w = 1. $$

Solving by Gaussian elimination (or directly), we find


$$ x = \frac{d}{a d - b c}, \qquad y = \frac{-b}{a d - b c}, \qquad z = \frac{-c}{a d - b c}, \qquad w = \frac{a}{a d - b c}, $$
provided the common denominator a d - b c ≠ 0 does not vanish. Therefore, the matrix
$$ X = \frac{1}{a d - b c} \begin{pmatrix} d & -b \\ -c & a \end{pmatrix} $$
forms a right inverse to A. However, a short computation shows that it also defines a left
inverse:

$$ X\,A = \begin{pmatrix} x a + y c & x b + y d \\ z a + w c & z b + w d \end{pmatrix} = \begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix} = I, $$
and hence X = A^{-1} is the inverse to A.
The denominator appearing in the preceding formulae has a special name; it is called
the determinant of the 2 × 2 matrix A, and denoted
$$ \det \begin{pmatrix} a & b \\ c & d \end{pmatrix} = a d - b c. $$
Thus, the determinant of a 2 × 2 matrix is the product of the diagonal entries minus the
product of the off-diagonal entries. (Section 1.9 discusses how to define the determinant
of a larger square matrix.) Thus, if det A ≠ 0, the 2 × 2 matrix A is invertible, with
$$ A^{-1} = \frac{1}{a d - b c} \begin{pmatrix} d & -b \\ -c & a \end{pmatrix}. \eqno(1.40) $$

For example, if
$$ A = \begin{pmatrix} 1 & 3 \\ -2 & -4 \end{pmatrix}, $$
then det A = 2 ≠ 0. We conclude that A has an inverse, which, by (1.40), is
$$ A^{-1} = \frac{1}{2} \begin{pmatrix} -4 & -3 \\ 2 & 1 \end{pmatrix} = \begin{pmatrix} -2 & -\frac{3}{2} \\ 1 & \frac{1}{2} \end{pmatrix}. $$

On the other hand, if det A = 0, then A is a singular matrix and has no inverse.
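Formula (1.40) is simple enough to test directly; the following Python snippet (ours) computes the 2 by 2 inverse and checks that A times the result is the identity, using the example matrix above.

    # Invert a 2 x 2 matrix [[a, b], [c, d]] via (1.40), provided det = a d - b c is nonzero.
    def inverse_2x2(A):
        (a, b), (c, d) = A
        det = a * d - b * c
        if det == 0:
            raise ValueError("matrix is singular")
        return [[d / det, -b / det], [-c / det, a / det]]

    A = [[1.0, 3.0], [-2.0, -4.0]]
    X = inverse_2x2(A)
    print(X)                                                  # [[-2.0, -1.5], [1.0, 0.5]]
    print([[sum(A[i][k] * X[k][j] for k in range(2)) for j in range(2)] for i in range(2)])
    # [[1.0, 0.0], [0.0, 1.0]]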
The following key result will be established later in this chapter.
Theorem 1.15. A square matrix A has an inverse if and only if it is nonsingular.
Consequently, an n × n matrix will have an inverse if and only if it can be reduced
to upper triangular form with n nonzero pivots on the diagonal by a combination of
elementary row operations. All other matrices will be singular, and not have an inverse
as defined above. Before attempting to prove this fundamental result, and also establish
algorithms for computing the inverse of a nonsingular matrix, we need to first become
familiar with some elementary properties of matrix inverses.
Lemma 1.16. The inverse of a square matrix, if it exists, is unique.

Proof : If X and Y both satisfy (1.39), so X A = I = A X and Y A = I = A Y , then,


by associativity,
X = X I = X(A Y ) = (XA) Y = I Y = Y,
and hence X = Y .

Q.E.D.

Inverting a matrix twice gets us back to where we started.


Lemma 1.17. If A is invertible, then A^{-1} is also invertible and (A^{-1})^{-1} = A.
Proof: According to (1.39), A^{-1} A = I = A A^{-1}, which is enough to prove the result.
Q.E.D.
Example 1.18. We already learned how to find the inverse of an elementary matrix
of type #1; we just negate the one nonzero off-diagonal entry. For example, if

$$ E = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 2 & 0 & 1 \end{pmatrix}, \qquad \text{then} \qquad E^{-1} = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ -2 & 0 & 1 \end{pmatrix}. $$

This reflects the fact that the inverse of the elementary row operation that adds twice the
first row to the third row is the operation of subtracting twice the first row from the third
row.

Example 1.19. Let
$$ P = \begin{pmatrix} 0 & 1 & 0 \\ 1 & 0 & 0 \\ 0 & 0 & 1 \end{pmatrix} $$
denote the elementary matrix that has the effect of interchanging rows 1 and 2 of a matrix. Then P^2 = I, since doing the same operation twice in a row has no net effect. This implies that P^{-1} = P is its own inverse.
Indeed, the same result holds for all elementary permutation matrices that correspond to
row operations of type #2. However, it is not true for more general permutation matrices.
Lemma 1.20. If A, B are invertible matrices of the same size, then their product,
A B, is invertible, and
$$ (A\,B)^{-1} = B^{-1} A^{-1}. \eqno(1.41) $$
Note particularly the reversal in order of the factors.
Proof: Let X = B^{-1} A^{-1}. Then, by associativity,
$$ X\,(A\,B) = B^{-1} A^{-1} A\,B = B^{-1} B = I, \qquad (A\,B)\,X = A\,B\,B^{-1} A^{-1} = A\,A^{-1} = I. $$

Thus X satisfies (1.39) for A B and the result follows.

Q.E.D.

Example 1.21. One verifies, directly, that the inverse of
$$ A = \begin{pmatrix} 1 & 2 \\ 0 & 1 \end{pmatrix} \qquad \text{is} \qquad A^{-1} = \begin{pmatrix} 1 & -2 \\ 0 & 1 \end{pmatrix}, $$
while the inverse of
$$ B = \begin{pmatrix} 0 & 1 \\ -1 & 0 \end{pmatrix} \qquad \text{is} \qquad B^{-1} = \begin{pmatrix} 0 & -1 \\ 1 & 0 \end{pmatrix}. $$
Therefore, the inverse of
$$ C = A\,B = \begin{pmatrix} 1 & 2 \\ 0 & 1 \end{pmatrix} \begin{pmatrix} 0 & 1 \\ -1 & 0 \end{pmatrix} = \begin{pmatrix} -2 & 1 \\ -1 & 0 \end{pmatrix} $$
is
$$ C^{-1} = B^{-1} A^{-1} = \begin{pmatrix} 0 & -1 \\ 1 & 0 \end{pmatrix} \begin{pmatrix} 1 & -2 \\ 0 & 1 \end{pmatrix} = \begin{pmatrix} 0 & -1 \\ 1 & -2 \end{pmatrix}. $$
We can straightforwardly generalize the preceding result. The inverse of a multiple


product of invertible matrices is the product of their inverses, in the reverse order :
1
1 1
(A1 A2 Am1 Am )1 = A1
m Am1 A2 A1 .

(1.42)

Warning: In general, (A + B)1 6= A1 + B 1 . This equation is not even true for


scalars (1 1 matrices)!
Gauss-Jordan Elimination
Let us now present the basic algorithm used to compute the inverse matrix, known
as Gauss-Jordan Elimination. A key point is that, for square matrices, we only need to solve
$$ A\,X = I \eqno(1.43) $$
in order to compute X = A^{-1}. The other equation in (1.39), namely X A = I, will then follow as an automatic consequence. In other words, just as we witnessed in the 2 × 2
case, once a matrix forms a right inverse, then it automatically forms a left inverse, and
conversely!
The reader may well ask, then, why use both left and right inverse conditions in the
original definition? There are several good reasons. First of all, a rectangular matrix may
satisfy one of the two conditions (having either a left inverse or a right inverse) but
can never satisfy both; see Exercise . Moreover, even when we restrict our attention to
square matrices, starting with only one of the conditions makes the logical development
of the subject considerably more difficult, and not really worth the extra effort. Once we
have established the basic properties of the inverse of a square matrix, we can then safely
discard the superfluous second condition. Finally, when we generalize the notion of an
inverse to a linear operator in Chapter 7, then, unlike square matrices, we cannot dispense
with either one of the conditions.
According to (1.11), we can perform the matrix multiplication on the left hand side
of (1.43) column by column and equate the results to the corresponding columns of the
identity matrix I_n, which are
$$ e_1 = \begin{pmatrix} 1 \\ 0 \\ 0 \\ \vdots \\ 0 \end{pmatrix}, \qquad e_2 = \begin{pmatrix} 0 \\ 1 \\ 0 \\ \vdots \\ 0 \end{pmatrix}, \qquad \ldots \qquad e_n = \begin{pmatrix} 0 \\ 0 \\ \vdots \\ 0 \\ 1 \end{pmatrix}. \eqno(1.44) $$
Thus, ei is the vector with 1 in the ith slot and 0s elsewhere. If x1 , . . . , xn denote the
columns of the inverse matrix X, then the matrix equation (1.43) is equivalent to n linear
systems
$$ A\,x_1 = e_1, \qquad A\,x_2 = e_2, \qquad \ldots \qquad A\,x_n = e_n, \eqno(1.45) $$

all having the same coefficient matrix. As such, to solve them we need to form the n augmented matrices M_i = ( A | e_i ), i = 1, ..., n, and then perform our Gaussian elimination algorithm. But this would be a waste of effort. Since we will end up performing identical row operations on each augmented matrix, it is more efficient to combine them into one large augmented matrix M = ( A | e_1 ... e_n ) = ( A | I ), of size n × 2n, in which the right hand sides e_1, ..., e_n of our systems are placed into n different columns, which we then recognize as forming the columns of an n × n identity matrix. We may then systematically apply our elementary row operations to reduce, if possible, the large augmented matrix so that its first n columns are in upper triangular form.
Example 1.22. For example, to find the inverse of the coefficient matrix

0 2 1
A = 2 6 1,
1 1 4

we form the large augmented matrix

0 2
2 6
1 1

1
1
4

1 0

0 1

0 0

Applying the same elementary row operations as

2 6 1 0
0 2 1 1

1 1 4 0

0
0.
1

above, we first interchange the rows

1 0
0 0,
0 1

and then eliminate the nonzero entries below the first

2 6 1 0 1
0 2 1 1 0

0 2 72 0 12
Next eliminate the entry below the

2
0
0

second pivot:

6 1 0 1
2 1 1 0
0 92 1 12

pivot,

0
0.
1

0
0.
1

At this
stage, we have reduced our augmented matrix to the upper triangular form

U | C , which is equivalent to reducing the original n linear systems A x i = ei to n


upper triangular systems U xi = ci . We could therefore perform n back substitutions to
produce the solutions xi , which would form the individual columns of the inverse matrix
X = (x1 . . . xn ).
An alternative, and essentially
equivalent approach is to fully reduce the augmented

matrix to the form I | X in which the left hand matrix has become
matrix,
an identity

1
while the right hand side is the desired solution X = A . Indeed, I | X represents the
n trivial, but equivalent, linear systems I xi = xi with identity coefficient matrix. Since

3/7/03

23

c 2003

Peter J. Olver

we can implement this process using elementary row operations, the linear systems are
equivalent, having the same solutions, and thereby justifying the computation.
Thus, our goal is to apply elementary
row operations
to reduce a large augmented

matrix of the form M = A | I to one of the form I | X . The resulting right hand
matrix X = A1 is the inverse to the original matrix A. Now, the identity matrix has
0s below the diagonal, just like U . It also has 1s along the diagonal, whereas U has the
pivots (which are all nonzero) along the diagonal. Thus, the next phase in the procedure is
to make all the diagonal entries of U equal to 1. To do this, we need to introduce the last,
and least, of our elementary row operations. The required operation on linear systems is:
Linear System Operation #3 : Multiply an equation by a nonzero constant.
This operation does not change the solution, and so yields an equivalent linear system.
The corresponding row operation is:
Elementary Row Operation #3 : Multiply a row of the matrix by a nonzero scalar.

In our case, we divide the rows of the upper triangular augmented


matrix
U
|
C

by the diagonal pivots of U , producing a matrix of the form V | K where V is special


upper triangular , meaning it has all 1s along the diagonal. In our particular example, the
result of these three elementary row operations of Type #3 is

1
1 3 12 0
0
2

0 1 12 12
0
0 ,

0 0 1 2 1 2
9

where we multiplied the first and second rows by 21 and the third row by 29 .
We are now over half way towards our goal of an identity matrix on the left. We need
only make the entries above the diagonal equal to zero. This can be done by elementary
row operations of Type #1. Now we work backwards in accord with back substitution.
First, eliminate the nonzero entries in the third column lying above the (3, 3) entry; this
is done by subtracting one half the third row from the second and also from the first:

7
91
1 3 0 19 18

1
0 1 0 59
91 .
18

2
0 0 1 2
1
9

Finally, subtract
entry:

1
3

1 0

0 23
18

7
0 18

2
1

7
18
1
18
19

0 1
0 0
9
The final right hand matrix is our desired inverse:

7
23
18
18
7
1
A1 = 18
18
2
9

3/7/03

the second from the first to eliminate the remaining nonzero off-diagonal

24

19

2
9
19
2
9

2
9
19
2
9

.
c 2003

Peter J. Olver

The reader should verify that it does satisfy the two inverse conditions A A 1 = I = A1 A.
Let us now complete the proofs of the basic results on inverse matrices. First, we
need to determine the third type of elementary matrix corresponding to elementary row
operation of type #3. Again, this is obtained by performing the indicated elementary row
operation on the identity matrix. The result is a diagonal matrix , meaning that all its
off-diagonal entries are zero; see Exercise .
Thus, the elementary matrix that multiplies row i by the nonzero scalar c 6= 0 is the
diagonal matrix having c in the ith diagonal position, and 1s elsewhere along the diagonal.
The inverse elementary matrix E 1 is the diagonal matrix with 1/c in the ith diagonal
position and 1s elsewhere on the main diagonal; it corresponds to the inverse operation
that divides row i by c. For example, the elementary matrix that multiplies the second
row of a 3 n matrix by the scalar 3 is

1 0 0
1 0 0
and has inverse
E 1 = 0 13 0 .
(1.46)
E = 0 3 0,
0 0 1
0 0 1

We will sometimes write D = diag(c1 , . . . , cn ) for the n n diagonal matrix with diagonal entries dii = ci . For example, the elementary matrix (1.46) can be written as
E = diag(1, 3, 1).
The GaussJordan method tells us how to reduce any nonsingular square matrix A
to the identity matrix by a sequence of elementary row operations. Let E 1 , E2 , . . . , EN be
the corresponding elementary matrices. Therefore,
EN EN 1 E2 E1 A = I .

(1.47)

X = EN EN 1 E2 E1

(1.48)

We claim that the product


is the inverse of A. Indeed, formula (1.47) says that X A = I , and so X is a left inverse.
Furthermore, each elementary matrix has an inverse, and so by (1.42), X itself is invertible,
with
1
1
X 1 = E11 E21 EN
(1.49)
1 EN .

Therefore, multiplying the already established formula X A = I on the left by X 1 , we


find A = X 1 , and so, by Lemma 1.17, X = A1 as claimed. This completes the proof of
Theorem 1.15. Finally, equating A = X 1 to (1.49), and using the fact that the inverse of
an elementary matrix is also an elementary matrix, we have established:
Proposition 1.23.
elementary matrices.

Any nonsingular matrix A can be written as the product of

As an application, let prove that the inverse of a triangular matrix is also triangular.
Specifically:
Lemma 1.24. If L is a lower triangular matrix with all nonzero entries on the main
diagonal, then L is nonsingular and its inverse L1 is also lower triangular. In particular,
if L is special lower triangular, so is L1 . A similar result holds for upper triangular
matrices.
3/7/03

25

c 2003

Peter J. Olver

Proof : It suffices to note that if L has all nonzero diagonal entries, one can reduce
L to the identity by elementary row operations of Types #1 and #3, whose associated
elementary matrices are all lower triangular. Lemma 1.2 implies that the product (1.48)
is then also lower triangular. If L is special, then all the pivots are equal to 1 and so no
elementary row operations of Type #3 are required, so the inverse is a product of special
lower triangular matrices, and hence is itself special lower triangular. A similar argument
applies in the upper triangular cases.
Q.E.D.
Solving Linear Systems with the Inverse
An important motivation for the matrix inverse is that it enables one to effect and
immediate solution to a nonsingular linear system.
Theorem 1.25. If A is invertible, then the unique solution to the linear system
A x = b is given by x = A1 b.
Proof : We merely multiply the system by A1 . We find
x = A1 A x = A1 b,
as desired.

Q.E.D.

Thus, with the inverse in hand, a more direct way to solve our example (1.29) is to
multiply the right hand side by the inverse matrix:
5

7
2
23
2
x
18
18
9
6
7
1 = 5 ,
1
y =

18

7
18
9
6
z
2
2
1
3
1
9

reproducing (1.32).
However, while sthetically appealing, the solution method based on the inverse matrix is hopelessly inefficient as compared to forward and back substitution based on a
(permuted) L U factorization, and should not be used ! We shall discuss precisely why this
is the case in Section 1.7, but the reader (in contrast to what might have been taught in
an introductory linear algebra course) should never use the matrix inverse. In fact, the
computation of the inverse of a matrix is essentially useless for practical linear algebra
problems.
This is not to say that the inverse is completely uninteresting. Far from it! The
inverse continues to play a fundamental role in the theoretical side of linear algebra, as
well as providing important insight into the algorithms that are used in practice. But the
basic message of practical, applied linear algebra is that L U decomposition and Gaussian
Elimination are fundamental; inverses are for theoretical purposes only.
Remark : The reader may have learned a version of the GaussJordan algorithm for
solving a single linear system that replaces the back substitution step by a further application of all three types of elementary row operations in order to reduce the coefficient

matrix to the identity. In other words, we start with the augmented matrix M = A | b
3/7/03

26

c 2003

Peter J. Olver

and use all three types of elementary


row operations to produce (assuming nonsingularity)

the fully reduced form I | x representing the trivial, equivalent system I x = x with the
solution x to the original system in the final column. However, as we shall see, the back
substitution approach is much more efficient, and is the method of choice in all practical
situations.
The L D V Factorization
The GaussJordan construction leads to a slightly more detailed version of the L U
factorization, which is useful in certain situations. Let D denote the diagonal matrix having
the same diagonal entries as U ; In other words, D has the pivots on its diagonal and zeros
everywhere else. Let V be the special upper triangular matrix obtained from U by dividing
each row by its pivot, so that V has all 1s on the diagonal. We already encountered V
during the course of the GaussJordan method. It is easily seen that U = D V , which
implies the following result.
Theorem 1.26. A matrix A is regular if and only if it admits a factorization
A = L D V,

(1.50)

where L is special lower triangular matrix, D is a diagonal matrix having the nonzero
pivots on the diagonal, and V is special upper triangular.
For the matrix appearing in Example 1.5, we have U = D V , where

2 1 1
2 0 0
1 12
U = 0 3 0 ,
D = 0 3 0 ,
V = 0 1
0 0
0 0 1
0 0 1

producing the A = L D V decomposition


1 0 0
2
2 1 1
4 5 2 = 2 1 00
1 1 1
0
2 2 0

0
3
0

0
1

0
0
1
0

1
2

1
0

1
2

1
2

0 ,
1

0 .
1

In the more general case when row interchanges are required, one produces a permuted
L D V factorization in the following form.
Theorem 1.27. A matrix A is nonsingular if and only if there is a permutation
matrix P such that
P A = L D V,
(1.51)
where L, D, V are as before.
Lemma 1.28. If A = L U is regular, then the factors L and U are each uniquely
determined. The same holds for the A = L D V factorization.
3/7/03

27

c 2003

Peter J. Olver

Proof : Suppose

eU
e.
LU = L

Since the diagonal entries of all four matrices are non-zero, Lemma 1.24 implies that they
are invertible. Therefore,
e 1 L = L
e 1 L U U 1 = L
e 1 L
eU
e U 1 = U
e U 1 .
L

(1.52)

The left hand side of the matrix equation (1.52) is the product of two special lower triangular matrices, and so, according to Lemma 1.2, is itself special lower triangular with
1s on the diagonal. The right hand side is the product of two upper triangular matrices,
and hence is itself upper triangular. Comparing the individual entries, the only way such
a special lower triangular matrix could equal an upper triangular matrix is if they both
equal the diagonal identity matrix. Therefore,
e 1 L = I = U
e U 1 ,
L

e = L and U
e = U , and proves the result. The L D V version is an
which implies that L
immediate consequence.
Q.E.D.

Uniqueness does not hold for the more general permuted factorizations (1.35), (1.51)
since there may be various permutation matrices that place a matrix A in regular form
P A. Moreover, unlike the regular case, the pivots, i.e., the diagonal entries of U , are no
longer uniquely defined, but depend on the particular combination of row interchanges
employed during the course of the computation.

1.6. Transposes and Symmetric Matrices.


Another basic operation on a matrix is to interchange its rows and columns. If A is
an m n matrix, then its transpose, denoted AT , is the n m matrix whose (i, j) entry
equal the (j, i) entry of A; thus
bij = aji ,
For example, if
A=

1 2
4 5

3
6

where

then

B = AT .

1
T

A = 2
3

4
5.
6

Note that the rows of A are the columns of AT and vice versa. In particular, the transpose
of a row vector is a column vector, while the transpose of a column vector is a row vector:

1
v = 2 ,
then
vT = ( 1 2 3 ).
3

The transpose of a scalar, considered as a 1 1 matrix, is itself: c T = c for c R.

Remark : To conserve vertical space, we will often use the transpose notation, e.g.,
T
v = ( v1 , v2 , v3 ) , as a compact way of writing column vectors.
3/7/03

28

c 2003

Peter J. Olver

In the square case, transpose can be viewed as reflecting the matrix entries across
the main diagonal. For example,

1
3
2

2
0
4

1
1 3
5 = 2 0
8
1 5

2
4 .
8

In particular, the transpose of a lower triangular matrix is upper triangular and vice-versa.
Performing the transpose twice gets you back to where you started:
(AT )T = A.

(1.53)

Unlike the inverse, the transpose is compatible with matrix addition and scalar multiplication:
(1.54)
(A + B)T = AT + B T ,
(c A)T = c AT ,
c R.
The transpose is also compatible with matrix multiplication, but with a twist. Like the
inverse, the transpose reverses the order of multiplication:
(A B)T = B T AT .

(1.55)

The proof of (1.55) is a straightforward consequence of the basic laws of matrix multiplication. An important special case is the product between a row vector v T and a column
vector w. In this case,
vT w = (vT w)T = wT v,
(1.56)
because the product is a scalar and so equals its own transpose.
Lemma 1.29. The operations of transpose and inverse commute. In other words, if
A is invertible, so is AT , and its inverse is
AT (AT )1 = (A1 )T .

(1.57)

Proof : Let Y = (A1 )T . Then, according to (1.55),


Y AT = (A1 )T AT = (A A1 )T = I T = I .
The proof that AT Y = I is similar. We conclude that Y = (AT )1 .

Q.E.D.

Factorization of Symmetric Matrices


The most important class of square matrices are those that are unchanged by the
transpose operation.
Definition 1.30. A square matrix is called symmetric if it equals its own transpose:
A = AT .
3/7/03

29

c 2003

Peter J. Olver

Thus, A is symmetric if and only if its entries satisfy aji = aij for all i, j. In other
words, entries lying in mirror image positions relative to the main diagonal must be
equal. For example, the most general symmetric 3 3 matrix has the form

a b c
A = b d e .
c e f

Note that any diagonal matrix is symmetric; in particular I T = I . A lower or upper


triangular matrix is symmetric if and only if it is, in fact, a diagonal matrix.
The L D V factorization of a nonsingular matrix takes a particularly simple form if
the matrix also happens to be symmetric. This result will form the foundation of some
significant later developments.
Theorem 1.31. A symmetric matrix A is regular if and only if it can be factored as
A = L D LT ,

(1.58)

where L is a special lower triangular matrix and D is a diagonal matrix with nonzero
diagonal entries.
Proof : We already know, according to Theorem 1.26, that we can factorize
A = L D V.

(1.59)

We take the transpose of both sides of this equation and use (1.55), so
AT = (L D V )T = V T DT LT = V T D LT ,

(1.60)

where we used the fact that a diagonal matrix is automatically symmetric, D T = D. Note
that V T is special lower triangular, and LT is special upper triangular. Therefore (1.60)
gives the L D V factorization of AT .
In particular, if A = AT , then we can invoke the uniqueness of the L D V factorization,
cf. Lemma 1.28, to conclude that
L = V T,

and

V = LT ,

(which are the same equation in two different forms). Replacing V by L T in (1.59) suffices
to prove the factorization (1.58).
Q.E.D.
Example 1.32. Let us find the L D LT factorization of the particular symmetric
matrix

1 2 1
A = 2 6 1.
1 1 4

This is done by performing the usual Gaussian elimination algorithm. Subtracting twice
the first row from the second and also the first row from the third produces the matrix
3/7/03

30

c 2003

Peter J. Olver

1 2
1
0 2 1 . We then
0 1 3
row, resulting in the upper

1
U = 0
0

add one half of the second row of the latter matrix to its third
triangular form

2 1
1 0
2 1 = 0 2
0 52
0 0


0
1 2
0 0 1
5
0 0
2

1
21 ,
1

where we perform a further factorization into a diagonal matrix D containing the pivots
and a special upper triangular matrix V obtained by dividing each row of U by its pivot.
On the other hand, the special lower triangular matrix associated with the row operations
is

1 0 0
L = 2 1 0,
1 21 1

which, as guaranteed by Theorem 1.31, is the transpose of V = LT . Therefore, the desired


A = L U = L D LT factorizations of this matrix are

1 0 0
1 2 1
1 2 1
2 6 1 = 2 1 0 0 2 1
1 12 1
0 0 52
1 1 4



1 0 0
1 0 0
1 2 1
= 2 1 0 0 2 0 0 1 12 .
1 12 1
0 0 1
0 0 52
Example 1.33. Let us look at a general 2 2 symmetric matrix

a b
A=
.
b c

(1.61)

Regularity of the matrix requires that the first pivot be a 6= 0. A single row operation will
place A in upper triangular form

a
c
U=
b2 .
0 ac
a
The associated lower triangular matrix is

1 0
L= b
.
a 1
Thus, A = L U . Finally,

a
0
D=
b2
0 ac
a

is just the diagonal part of U , and we find U = D LT , so that the L D LT factorization is


explicitly given by


b
a
0
a b
1 0
1
a .
(1.62)
= b
b2
b c
0 ac
0
1
a 1
a
3/7/03

31

c 2003

Peter J. Olver

Remark : If A = L D LT , then A is necessarily symmetric. Indeed,


AT = (L D LT )T = (LT )T DT LT = L D LT = A.
However, not every symmetric
has an L D LT factorization. A simple example is
matrix

0 1
the irregular 2 2 matrix
; see Exercise .
1 0

1.7. Practical Linear Algebra.


The illustrative examples used to explain the basic algorithms have all been small
(2 2 or 3 3) matrices. In such cases, or even for moderate sized matrices, the differences
between the various approaches to solving linear systems (Gauss, GaussJordan, matrix
inverse, etc.) are relatively unimportant, particularly if one has a decent computer or
even hand calculator to perform the tedious parts. However, applied mathematics will
often lead to much larger matrices, and then one must be efficient in order to stand any
chance of solving a linear system. For example, numerical solutions of ordinary differential
equations will typically lead to matrices with hundreds of entries, while numerical solution
of partial differential equations arising in fluid and solid mechanics, weather prediction,
image and video processing, chemical reactions, quantum mechanics, molecular dynamics,
and so on, will often lead to matrices with millions of entries. It is not hard for such
systems to tax even the most sophisticated supercomputer. Thus, it is essential that we
gain some familiarity with the computational details of the basic algorithms so we can
compare their efficiency, and, as a result, gain some experience with the issues underlying
the design of fast numerical algorithms.
A basic question is: how many arithmetic operations are required for each of our
algorithms? We shall keep track of additions and multiplications separately, since the
latter typically take slightly longer to perform in a computer processor. However, we
shall not distinguish between addition and subtraction, nor between multiplication and
division, as these typically rely on the same floating point algorithm. We shall also assume
that the matrices and vectors are generic, with few, if any, zero entries. Modifications
of the basic algorithms for sparse matrices, meaning ones with lots of zero entries, are
an important topic of research, since the large matrices that appear in applications to
differential equations are usually sparse. We refer the interested reader to more advanced
numerical linear algebra texts, e.g., [101, numla], for discussions of these more involved
issues.
First, for ordinary multiplication of an nn matrix A and a vector b R n , each entry
of the product A b requires n multiplications of the form aij bj and n 1 additions to sum
the resulting products. Since there are n entries, this means a total of n 2 multiplications
and n(n 1) = n2 n additions. Thus, for a matrix of size n = 100, one needs about
10, 000 distinct multiplications and a similar (but slightly fewer) number of additions. If
n = 1, 000, 000 = 106 then n2 = 1012 , which is phenomenally large, and the total time
required to perform the computation becomes a significant issue.
Let us now look at the regular Gaussian Elimination algorithm, referring back to our
basic program. First, we shall figure out how many arithmetic operations that use the j th
3/7/03

32

c 2003

Peter J. Olver

pivot mjj . There are n j rows lying below the j th pivot. For each row, we must perform
one division to compute the factor lij = mij /mjj . The entries lying in the same column
as the pivot will be made equal to zero automatically, and so we need only compute the
updated entries mik 7 mik lij mjk lying below and to the right of the pivot mjj . There
are (n j)2 such entries in the coefficient matrix and an additional n j entries in the
last column of the augmented matrix. Let us concentrate on the former for the moment;
for each of these, we must perform one multiplication and one addition. Therefore, for
the j th pivot there are a total of (n j)(n j + 1) multiplications including the initial
n j divisions needed to produce the lij and (n j)2 additions. Therefore, to reduce
a regular n n matrix to upper triangular form requires a total of
n
X

n3 n
(n j)(n j + 1) =
3
j =1

multiplications, and
(1.63)

n
X

2n3 3n2 + n
(n j)2 =
6
j =1

additions.

(In Exercise the reader is asked to prove these summation formulae by induction.) For
large n, both of these expressions are approximately equal to 31 n3 operations.
We should also be keeping track of the number of operations on the right hand side
of the equation. No pivots appear there, and so there are
n
X

j =1

(n j) =

n2 n
2

(1.64)

multiplications and the same number of additions required to produce the right hand side
in the resulting triangular system U x = c.
The next phase of the algorithm, back substitution, can be similarly analyzed. Working backwards, to find the value of xj , we already have computed xj+1 , . . . , xn , and the
formula

n
X
1
cj
xj =
uji xi
ujj
i=j+1

requires n j + 1 multiplications/divisions and n j additions. Therefore, the Back


Substitution algorithm requires
n
X

j =1
n
X

(n j + 1) =

n2 + n
2

multiplications, along with


(1.65)

n2 n
(n j) =
2
j =1

additions.

For n large, both of these are approximately equal to 12 n2 . Comparing with our initial discussion, we discover that back substitution requires about one half the number of
arithmetic operations as multiplying a matrix times a vector.
3/7/03

33

c 2003

Peter J. Olver

Forward substitution has the same operations count, except that since the diagonal
entries of L are all equal to 1, no divisions are required, and so we use a total of 12 (n2 n)
multiplications and the same number of additions. Thus, once we have computed the L U
decomposition of the matrix A, the Forward and Back Substitution algorithm requires
about n2 arithmetic operations of the two types, which is the same as the number of
operations needed to perform the matrix multiplication A1 b. Thus, even if we know the
inverse of a matrix, it is still just as efficient to use Forward and Back Substitution to
compute the solution!
As noted above, the computation of L and U requires about 31 n3 operations. On the
other hand, to complete the GaussJordan elimination scheme, we must perform all the
elementary row operations on the large augmented matrix, which has size n 2 n. Therefore, during the reduction to upper triangular form, there are an additional 12 n3 arithmetic
operations of each type required. Moreover, we then need to perform an additional 31 n3
operations to reduce U to the identity matrix, and a corresponding 12 n3 operations on the
right hand matrix, too. (All these are approximate totals, based on the leading term in the
actual count.) Therefore, GaussJordan requires a grand total of 53 n3 operations to complete, just to find A1 ; multiplying the right hand side to obtain the solution x = A1 b
involves another n2 operations. Thus, the GaussJordan method requires approximately
five times as many arithmetic operations, and so would take five times as long to complete, as compared to the more elementary Gaussian Elimination and Back Substitution
algorithm. These observations serve to justify our earlier contention that matrix inversion
is inefficient, and should never be used to solve linear systems in practice.

Tridiagonal Matrices

Of course, in special cases, the arithmetic operation count might be considerably reduced, particularly if A is a sparse matrix with many zero entries. A particularly important
case is that of a tridiagonal matrix
q

p1

A=

r1
q2
p2

r2
q3
..
.

r3
..
.
pn2

..

qn1
pn1

rn1
qn

(1.66)

in which all the entries except those on the main diagonal and the diagonals immediately
above and below it are zero. (Zero entries are left blank.) Such matrices arise in the
numerical solution of ordinary differential equations and the spline fitting of curves for
interpolation and computer graphics. If A is regular, it turns out that the factors L U are
3/7/03

34

c 2003

Peter J. Olver

lower and upper bidiagonal matrices,


1

l1

L=

1
l2

1
..
.

..

ln2

1
ln1

U =

u1
d2

u2
d3

u3
..
.

..

dn1

un1
dn
(1.67)

Multiplying out L U , and equating the result to A leads to the equations


d1 = q 1 ,

u1 = r 1 ,

l 1 d1 = p 1 ,

l1 u1 + d 2 = q 2 ,
..
.
lj1 uj1 + dj = qj ,

u2 = r 2 ,
..
.
uj = r j ,

l 2 d2 = p 2 ,
..
.
l j dj = p j ,

..
.
ln2 un2 + dn1 = qn1 ,

..
.

(1.68)

..
.
ln1 dn1 = pn1 ,

un1 = rn1 ,

ln1 un1 + dn = qn .
These elementary algebraic equations can be successively solved for the entries d 1 , u1 , l1 ,
d2 , u2 , l2 , d3 , u3 . . . . The original matrix A is regular provided the pivots d 1 , d2 , . . . are
never zero, which allows the recursive procedure to proceed.
Once the L U factors are in place, to solve a tridiagonal linear system A x = b, we
first solve L c = b by Forward Substitution, which leads to the recursive equations
c1 = b1 ,

c 2 = b2 l1 c1 ,

...

cn = bn ln1 cn1 .

(1.69)

We then solve U x = c by Back Substitution, again recursively:


xn =

cn
,
dn

xn1 =

cn1 un1 xn
,
dn1

...

x1 =

c 1 u 1 x2
.
d1

(1.70)

There are a total of 5 n 4 multiplications/divisions and 3 n 3 additions/subtractions


required to solve a general tridiagonal system of n linear equations an striking improvement over the general case.
Example 1.34. Consider the n n tridiagonal matrix

4 1
1 4 1

1 4 1

1 4 1
A=

.
.
.

.. .. ..

1 4 1
1 4
3/7/03

35

c 2003

Peter J. Olver

in which the diagonal entries are all qi = 4, while the entries immediately above and below
the main diagonal are all pi = ri = 1. According to (1.68), the tridiagonal factorization
(1.67) has u1 = u2 = . . . = un1 = 1, while
d1 = 4,

lj = 1/dj ,

dj+1 = 4 lj ,

j = 1, 2, . . . , n 1.

The computed values are

dj

lj

1
2
3
4
5
6
7

4
3.75
3.733333 . . .
3.732143 . . .
3.732057 . . .
3.732051 . . .
3.732051 . . .

.25
.2666666 . . .
.2678571 . . .
.2679426 . . .
.2679487 . . .
.2679491 . . .
.2679492 . . .

..
.

..
.

..
.

These converge rapidly to

dj 2 + 3 = 3.732051 . . . ,

3 = .2679492 . . . ,

which makes the factorization for large n almost trivial. The number 2 + 3 happens to
be the positive root of the quadratic equation
x=4

1
x

lj 2

or

x2 4x + 1 = 0.

The explanation of this observation will be revealed in Chapter 9.


Practical Pivoting
Let us now consider the irregular cases where row interchanges are required. In a
computer implementation, there is no need to waste processor time physically exchanging
the rows in memory. Rather, one introduces a separate array of pointers that serve to
indicate which original row is currently in which permuted position. More specifically, one
initializes n row pointers (1) = 1, . . . , (n) = n. Interchanging row i and row j of the
coefficient or augmented matrix is then accomplished by merely interchanging (i) and
(j). Thus, to access a matrix element that is currently in row i of the augmented matrix,
one merely retrieves the element that is in row (i) in the computers memory. An explicit
implementation of this strategy is provided below.
3/7/03

36

c 2003

Peter J. Olver

Even when there is a nonzero element in a diagonal position that can serve as a pivot,
there may be good numerical reasons for exchanging rows in order to use a better pivoting
element. Here is a simple example. Consider the linear system
.01 x + 1.6 y = 32.1,

x + .6 y = 22.

(1.71)

The exact solution is


x = 10,

y = 20.

Suppose we are working with a very primitive calculator that only retains 3 digits of
accuracy. (Of course, this is not a very realistic situation, but the example could be
suitably modified to produce similar problems no matter how many digits of accuracy our
computer retains.) The augmented matrix is

.01 1.6 32.1


.
1
.6 22
Choosing the (1, 1) entry .01 as our pivot, we

.01
1.6
0 159.4

produce the upper triangular form

32.1

3188 .

Since our calculator has only three place accuracy, it will round the entries in the second
row, producing the augmented coefficient matrix

32.1
.01
1.6

.
0 159.0 3190
The solution by back substitution gives
y=

3190
= 20.0628 . . . ' 20.1.
159

Then
x = 100 (32.1 1.6 y) = 100 (32.1 32.16) ' 100 (32.1 32.2) = 10.
The relatively small error in y has produced a very large error in x not even its sign is
correct!
The problem is that the first pivot, .01, is small relative to the other element lying in
its column. Interchanging the two rows before doing the row operation leads to a resolution
of the difficulty even with such an inaccurate calculator! After the interchange, we have

1
.6 22
,
.01 1.6 32.1
which results in the rounded-off upper triangular form

1
.6
1
.6 22
'

0 1.59
0 1.594
31.88
3/7/03

37

22

31.9 .

c 2003

Peter J. Olver

Gaussian Elimination With Partial Pivoting


start
for i = 1 to n
set (i) = i
next i
for j = 1 to n
if m(i),j = 0 for all i j, stop; print A is singular
choose i > j such that m(i),j is maximal
interchange (i) (j)
for i = j + 1 to n
set l(i)j = m(i)j /m(j)j
for k = j + 1 to n + 1
set m(i)k = m(i)k lij m(j)k
next k
next i
next j
end

The solution by back substitution now gives


y = 31.9/1.59 = 20.0628 . . . ' 20.1,

x = 22 .6 y = 22 12.06 ' 22 12.1 = 9.9,

which is a respectable answer.


The general strategy, known as Partial Pivoting, says that at each stage, we should
use the largest legitimate (i.e., lying on or below the diagonal) element as the pivot,
even if the diagonal element is nonzero. Let us write out the basic program, which also
includes the previously mentioned row pointers so as to avoid physically performing the
row interchanges.
Partial pivoting will solve most problems, although there can still be difficulties. For
instance, it will not handle the system
10 x + 1600 y = 3210,

x + .6 y = 22,

obtained by multiplying the first equation in (1.71) by 1000. The tip-off is that, while the
entries in the column containing the pivot are smaller, those in its row are much larger. The
solution to this difficulty is Full Pivoting, in which one also performs column interchanges
preferably with a column pointer to move the largest legitimate element into the
pivot position. In practice a column interchange is just a reordering of the variables in
3/7/03

38

c 2003

Peter J. Olver

the system, which, as long as one keeps proper track of things, also doesnt change the
solutions.
Finally, there are some matrices that are hard to handle even with pivoting tricks.
These matrices are known as ill-conditioned , and are typically characterized by being
almost singular . A good example of an ill-conditioned matrix is the n n Hilbert
matrix

1
2

1
3
Hn =
1
4
.
..

1
n

1
2
1
3
1
4
1
5

1
3
1
4
1
5
1
6

1
4
1
5
1
6
1
7

...

..
.

...
..
.

1
n+1

1
n+2

1
n+3

...

..
.

..
.

...
...

1
n
1
n+1
1
n+2
1
n+3

..
.

1
2 n1

(1.72)

It is known that Hn is nonsingular for all n; a proof will appear in Theorem 3.36. However,
the solution of a linear system whose coefficient matrix is a Hilbert matrix H n , even for
moderately sized n, is a very challenging problem, even if one implements double precision
arithmetic on the computer. This is because the larger n, the closer Hn is, in a sense, to
being singular.
The reader is urged to try the following experiment. Fix a value of n around 20.
Choose a vector x R n . Compute b = Hn x directly. Then try to solve the system
Hn x = b by Gaussian Elimination. If it works for n = 20, try n = 50 or 100. This will
give you a good indicator of the degree of precision used by your computer program, and
the accuracy of the numerical solution algorithm.

1.8. General Linear Systems.


So far, we have only treated linear systems involving the same number of equations as
unknowns, and then only those with nonsingular coefficient matrices. These are precisely
the systems that always have a unique solution. We now turn to the problem of solving
a general linear system of m equations in n unknowns. The cases not covered as yet
are rectangular systems, m 6= n, as well as square systems with non-invertible coefficient
matrices. The basic idea underlying the Gaussian Elimination Algorithm for nonsingular
matrices can be straightforwardly adapted to these cases, too. One systematically utilizes
the same elementary row operations so as to place the coefficient matrix in a particular
reduced form, which generalizes the upper triangular form we aimed for in the square,
nonsingular cases.

This can be quantified by saying that their determinant is very small, but non-zero; see
Section 1.9.

3/7/03

39

c 2003

Peter J. Olver

Definition 1.35. An m n
following staircase structure:

...

0
0 ...
0

0
0 ...
0
0

..
..
.. . .
..

.
.
.
.
.
U =

0
0 ...
0
0

0
0 ...
0
0

..
.. . .
..
..

.
.
.
.
.
0
0 ...
0
0

matrix is said to be in row echelon form if it has the


...

...

...

...

...

...

...

...

...

...

...

..
.

...

..

...

...

...

...
..
.
...

0
..
.

0
..
.

...
..
.
...

...
..
.
...

..
.

..

..
.

...

0
..
.

0
..
.

0
..
.

0
..
.

...
..
.
...

..
.

..
.

..

The entries indicated by


are the pivots, and must be nonzero. The first r rows of U
each have a single pivot in them, but some of the columns may fail to contain a pivot. The
entries below the staircase, indicated by the solid line, are all zero, while the non-pivot
entries above the staircase, indicated by stars, can be anything. The last m r rows are
identically zero, and do not contain any pivots. There may, in exceptional situations, be
one or more initial all zero columns. Here is an explicit example of a matrix in row echelon
form:

0 3 1
0 2 5 1
0
0 0 1 2 1 8

0 0 0
0 2 4 1
0 0 0
0 0 0
0

The three pivots, which are the first three nonzero entries in each row, are, respectively,
3, 1, 2.
Proposition 1.36. Any matrix can be reduced to row echelon form by a sequence
of elementary row operations of Types #1 and #2.
In matrix language, Proposition 1.36 implies that if A is any m n matrix, then there
exist an m m permutation matrix P and an m m special lower triangular matrix L
such that
P A = L U,
(1.73)
where U is in row echelon form. The factorization is not unique.
The proof of this result is based on the general Gaussian elimination algorithm, which
proceeds as follows. Starting at the top left of the matrix, one searches for the first column
which is not identically zero. Any of the nonzero entries in that column may serve as the
pivot. Partial pivoting indicates that it is probably best to choose the largest one, although
this is not essential for the algorithm to proceed. One places the chosen pivot in the first
row of the matrix via a row interchange, if necessary. The entries below the pivot are
made equal to zero by the appropriate elementary row operations of Type #1. One then
proceeds iteratively, performing the same reduction algorithm on the submatrix consisting
3/7/03

40

c 2003

Peter J. Olver

of all entries strictly to the right and below the pivot. The algorithm terminates when
either there is a pivot in the last row, or all rows lying below the last pivot are identically
zero, and so no more pivots can be found.
Example 1.37. Let us illustrate the general Gaussian Elimination algorithm with a
particular example. Consider the linear system
x + 3y + 2z u
= a,
2 x + 6 y + z + 4 u + 3 v = b,

(1.74)

x 3 y 3 z + 3 u + v = c,

3 x + 9 y + 8 z 7 u + 2 v = d,
of 4 equations in 5 unknowns, where a, b, c, d are specified numbers . The coefficient matrix
is

1
3
2 1 0
6
1
4 3
2
(1.75)
A=
.
1 3 3 3 1
3
9
8 7 2
To solve the system, we introduce the augmented matrix

1
3
2 1 0 a

6
1
4 3 b
2
.

1 3 3 3 1 c

d
3
9
8 7 2

The (1, 1) entry can serve as the first pivot; we eliminate the entries below it by elementary
row operations, resulting in

1 3 2 1 0
a

0 0 3 6 3 b 2 a

.
0 0 1 2 1 c + a

0 0 2 4 2
d 3a

The second column contains no suitable nonzero entry to serve as the second pivot. (The
top entry already lies in a row with a pivot in it, and so cannot be used.) Therefore, we
move on to the third column, choosing the (2, 3) entry as our second pivot. Again, we
eliminate the entries below it, leading to

a
1 3 2 1 0

0 0 3 6 3
b 2a

5
1
0 0 0
0 0 c 3 b + 3 a .

0 0 0
0 4 d + 2 b 13 a
3

It will be convenient to consider a general right hand side here, although the reader may
initially prefer to initially assign specific values to a, b, c, d.

3/7/03

41

c 2003

Peter J. Olver

The final pivot is in the last


matrix in row echelon form:

1
0

0
0

column, and we use a row interchange to place the coefficient


3
0
0
0

2
3
0
0

1
6
0
0

0
3
4
0

b 2a
.

d + 23 b 13

a
3

c 1b+ 5a
3
3

(1.76)

There are three pivots, 1, 3, 4, sitting in positions (1, 1), (2, 3) and (3, 5). Note the
staircase form, with the pivots on the steps and everything below the staircase being zero.
We shall find the solution to our systems after a brief theoretical interlude.

Warning: In the augmented matrix, pivots can never appear in the last column,
representing the right hand side of the system. Thus, even if c 31 b + 35 a 6= 0, that entry
does not count as a pivot.
We now introduce the most important numerical quantity associated with a matrix.
Definition 1.38. The rank of a matrix A is the number of pivots.
For instance, the rank of the matrix (1.75) equals 3, since its reduced row echelon form
has three pivots. The rank of an m n matrix is bounded by min{m, n}, since there is at
most one pivot per row and one pivot per column. The only matrix of rank 0 is the zero
matrix, which has no pivots. Nonsingular matrices are also characterized by their rank.
Proposition 1.39. A square matrix of size n n is nonsingular if and only if its
rank is equal to n.
Indeed, since there can be at most one pivot per row and one per column, the only
way an n n matrix can end up having n pivots is if its reduced row echelon form is upper
triangular. But these are precisely the nonsingular matrices.
Interestingly, the rank of a matrix does not depend on which elementary row operations are performed along the way to row echelon form. Indeed, performing a different
sequence of row operations say using partial pivoting versus no pivoting can produce
a completely different row echelon form. The remarkable fact, though, is that all such row
echelon forms end up having exactly the same number of pivots, and this number is the
rank of the matrix. A formal proof of this fact will appear in Chapter 2.
Once the system has been reduced to one in row echelon form, the solution proceeds
by a version of the back substitution algorithm. The first step is to see if there are any
incompatibilities in the system. Suppose one of the rows in the row echelon form of the
coefficient matrix is identically zero, but the corresponding entry in the last column of
the augmented matrix is nonzero. What linear equation would this represent? Well, the
coefficients of all the variables are zero, and so the equation is of the form 0 = c, where c,
the number on the right hand side of the equation, is the entry in the last column. If c 6= 0,
then the equation cannot be satisfied. Consequently, the entire system has no solutions,
and is an incompatible linear system. On the other hand, if c = 0, then the equation is
merely 0 = 0, and so is trivially satisfied. Such all zero rows do not prevent there being a
solution to the system.
3/7/03

42

c 2003

Peter J. Olver

Indeed, if every zero row in the echelon form of the augmented matrix also has a zero
entry in the last column, then the system is compatible, and one or more solutions exist.
For example, the last row in the echelon form (1.76) is all zero, and hence the last entry in
the final column must also vanish in order that the system be compatible. Therefore, the
linear system (1.74) will have a solution if and only if the right hand sides a, b, c, d satisfy
the linear constraint
5
1
(1.77)
3 a 3 b + c = 0.
In general, if the system is incompatible, there is nothing else to do. Otherwise,
suppose that all compatibility constraints are satisfied. To find the solution, we work
backwards, starting with the last row that contains a pivot. Each variable that indexes a
pivot column can be solved for. The variables corresponding to columns without pivots
are not specified, and, in fact, can take arbitrary values.
Definition 1.40. In a linear system U x = c in row echelon form, the variables
corresponding to columns containing a pivot are called basic variables, while the variables
corresponding to the columns without a pivot are called free variables.
Example 1.41. For the preceding example, assuming the compatibility condition
(1.77), the reduced augmented matrix (1.76) is

a
1 3 2 1 0
0 0 3 6 3

b 2a

0 0 0

a
0 4 d + 32 b 13
3
0 0 0
0 0
0
The pivots are found in columns 1, 3, 5, and so the corresponding variables, x, z, v, are
basic; the other variables, y, u, are free. We can solve the system for the basic variables in
terms of the free variables.
For instance, the particular values
a = 0,

b = 3,

c = 1,

d = 1,

satisfy the compatibility constraint (1.77). The resulting augmented echelon matrix (1.76)
corresponds to the system
x + 3y + 2z u
= 0,
3 z + 6 u + 3 v = 3,

4 v = 3,
0 = 0.
We now use Back Substitution to solve for the basic variables, leading to the general
solution
v = 34 ,

z = 1 + 2u + v = 14 + 2 u,

x = 3y 2z + u =

1
2

3 y 3 u.

In the general solution formula, the free variables y, v are completely arbitrary; any value
they take, e.g., y = 2, u = 1 , will give a solution to the original system: x = 3 + 27 ,
y = 2, z = 47 2 , u = 1 , v = 34 . But keep in mind that this is merely one of an
infinite number of different solutions.
3/7/03

43

c 2003

Peter J. Olver

In general, if the m n coefficient matrix of a system of m linear equations in n


unknowns has has rank r, there are m r all zero rows in the row echelon form, and these
m r equations must have zero right hand side in order that the system be compatible and
have a solution. Moreover, there are a total of r basic variables and n r free variables,
and so the general solution depends upon n r parameters.
Summarizing the preceding discussion, we have learned that are only three possible
outcomes for the solution to a general linear system.
Theorem 1.42. A system
(i )
(ii )
(iii )

A x = b of m linear equations in n unknowns has either


exactly one solution,
no solutions, or
infinitely many solutions.

Therefore, a linear system can never have a finite number other than 0 or 1
of solutions. If the system has more than one solution, e.g., 2 different solutions, it must
have infinitely many. This result is in clear contrast with nonlinear systems. For example,
a quadratic equation a x2 + b x + c = 0 can have either 0, 1, or 2 real solutions (and either
one or two complex solutions).
Case (ii) occurs if the system is incompatible, producing a zero row with a nonzero
right hand side. Case (iii) occurs if the system is compatible and there are one or more
free variables. This happens when the rank of the coefficient matrix is strictly less than
the number of columns: r < n. The first case occurs for nonsingular square coefficient
matrices, and, more generally, when r = n. Since r m, this can only happen if the
coefficient matrix has at least as many rows as columns, i.e., the linear system has at least
as many equations as unknowns.
Example 1.43. Consider the linear system
y + 4 z = a,

3 x y + 2 z = b,

x + y + 6 z = c,

consisting of three equations in three unknowns. The augmented coefficient matrix is


0 1 4 a
3 1 2 b .

1 1 6 c

Interchanging the first two rows, and then eliminating the elements below the first pivot
leads to

3 1 2
b
0 1
4
a .
4
16
0 3
c 13 b
3
The second pivot is in the (2, 2) position, but after eliminating the entry below it, we find
the row echelon form to be

3 1 2
b
0 1 4
.
a

1
4

0 0 0
c 3b 3a
3/7/03

44

c 2003

Peter J. Olver

No Solution

Unique Solution

Infinite # Solutions

Intersecting Planes.

Figure 1.1.

Since we have a row of all zeros, the original coefficient matrix is singular, and its rank is
only 2.
The compatibility condition for the system follows from this last row in the reduced
form, and so requires
4
1
3 a + 3 b c = 0.
If this is not satisfied, the system has no solutions; otherwise it has infinitely many. The
free variable is z, since there is no pivot in the third column, and the general solution is
y = a 4 z,

x=

1
3

b + 31 y 32 z =

1
3

a + 13 b 2z,

where z is arbitrary.
Geometrically, Theorem 1.42 is indicating something about the possible configurations
of linear subsets (lines, planes, etc.) of an n-dimensional space. For example, a single linear
equation a x + b y + c z = d defines a plane P in three-dimensional space. The solutions to
a system of three linear equations in three unknowns is the intersection P 1 P2 P3 of
three planes. Generically, three planes intersect in a single common point; this is case (i )
of the Theorem, and occurs if and only if the coefficient matrix is nonsingular. The case of
infinitely many solutions occurs when the three planes intersect on a common line, or, even
more degenerately, when they all coincide. On the other hand, parallel planes, or planes
intersecting in parallel lines, have no common point of intersection, and this corresponds
to the third case of a system with no solutions. Again, no other possibilities occur; clearly
one cannot have three planes having exactly 2 points in their common intersection it is
either 0, 1 or . Some possible configurations are illustrated in Figure 1.1.
Homogeneous Linear Systems
A linear system
Ax = 0

(1.78)

with all 0s on the right hand side is called a homogeneous system. On the other hand,
systems with nonzero right hand side A x = b, where b 6= 0 are called inhomogeneous.
Homogeneous systems are always compatible, since x = 0 is always a solution known as
3/7/03

45

c 2003

Peter J. Olver

the trivial solution. If the homogeneous system has a nontrivial, meaning nonzero, solution
x 6= 0, then Theorem 1.42 assures that it must have infinitely many solutions. This will
occur if and only if the rank of the coefficient matrix is less than the number of variables
or columns of the matrix, so r < n. In particular, since the rank is also always bounded by
the number of rows, r m, if a homogeneous system has fewer equations than unknowns,
so m < n, then it necessarily has infinitely many nontrivial solutions.
Theorem 1.44. A homogeneous linear system A x = 0 of m equations in n unknowns has a nontrivial solution x 6= 0 if and only if the rank of A is r < n. If m < n, the
system always has a nontrivial solution. If m = n, the system has a nontrivial solution if
and only if A is singular.
Example 1.45. Consider the homogeneous linear system
2 x1 + x2 + 5 x4 = 0,
with coefficient matrix

4 x1 + 2 x2 x3 + 8 x4 = 0,

A=
4
2

1
2
1

2 x1 x2 + 3 x3 4 x4 = 0,

0
5
1 8 .
3 4

Since the system is homogeneous and has fewer equations than unknowns, Theorem 1.44
assures us that it has infinitely many solutions, including the trivial solution x 1 = x2 =
x3 = x4 = 0. Moreover, again due to homogeneity, we do not need the augmented matrix,
since the final column of zeros will not be affected by the row operations, and so it is a
waste of time to carry it along. We therefore perform the Gaussian Elimination algorithm
directly on the coefficient matrix A. Working with the (1, 1) entry as the first pivot, we
first obtain

2 1 0
5
0 0 1 2 .
0 0 3
1
The (2, 3) entry is the second pivot, and we apply one final row operation to place the
matrix in row echelon form

2 1 0
5
0 0 1 2 .
0 0 0 5
This corresponds to the reduced homogeneous system
2 x1 + x2 + 5 x4 = 0,

x3 2 x4 = 0,

5 x4 = 0.

Since there are three pivots in the final row echelon form, the rank of the matrix A is
3. There is one free variable, namely x2 . Using Back Substitution, we easily obtain the
general solution
x2 = t,
x3 = x4 = 0,
x1 = 21 t,
which depends upon a single free parameter t = x2 .
3/7/03

46

c 2003

Peter J. Olver

Example 1.46. Consider the homogeneous linear system


2 x y + 3 z = 0,

4 x + 2 y 6 z = 0,

with coefficient matrix

2
4
A=
2
6

2 x y + z = 0,
1
2
1
3

6 x 3 y + 3 z = 0,

3
6
.
1
3

The system admits the trivial solution x = y = z = 0, but in this case we need to complete
the elimination algorithm before we can state whether or not there are other solutions.
After the first stage, the coefficient matrix has the form

2 1
0 0

0 0
0 0

3
0
.
2
6

To continue we need to interchange the second and third rows to place a nonzero entry in
the final pivot position; after that the reduction to row echelon form is immediate:

2 1
0 0

0 0
0 0

3
2

0
6

Thus, the system reduces to the equations


2 x y + 3 z = 0,

2 1
0 0

0 0
0 0

2 z = 0,

3
2
.
0
0

0 = 0,

0 = 0,

where the third and fourth equations are trivially compatible, as they must be in the
homogeneous case. The rank is equal to two, which is less than the number of columns,
and so, even though the system has more equations than unknowns, it has infinitely many
solutions. These can be written in terms of the free variable y, and so the general solution
x=

1
2

y,

z = 0,

where y is arbitrary.

1.9. Determinants.
The student may be surprised that, so far, we have avoided any mention of a topic that
often plays a central role in basic linear algebra: determinants. As with matrix inverses,
while determinants are of use in low-dimensional situations and for certain theoretical
developments, they are much less relevant for applications and practical computations.
The main reason is that the general determinantal formula is far too complicated to use
in practical computations. Indeed, the best way to compute a determinant is (surprise!)
Gaussian Elimination! However, for certain purposes, a familiarity with the basic theory
3/7/03

47

c 2003

Peter J. Olver

of determinants is important, and so we shall provide a very brief introduction in this final
section.
The determinant of a square matrix A, written det A, is a number that immediately
tells whether the matrix is singular or not. (Rectangular matrices do not have determinants.) We already encountered the determinant of a 2 2 matrix, which is equal to the
product of the diagonal entries minus the product of the off-diagonal entries:

a b
= a d b c.
(1.79)
det
c d
In Example 1.14, we saw that the determinant is nonzero if and only if the matrix has an
inverse. Thus, our goal is to generalize this simple observation to matrices of size n n.
There are many different ways to define determinants. The difficulty is that the actual formula is very unwieldy see (1.87) below and not well motivated. We prefer an
axiomatic approach that explains how our elementary row operations affect the determinant. In this manner, one can compute the determinant by Gaussian elimination, which
is, in fact, the fastest and most practical computational method in all but the simplest
situations. In effect, this remark obviates the need to ever compute a determinant.
Theorem 1.47. The determinant det A of a square matrix A is the uniquely defined
scalar quantity that satisfies the following axioms:
(1) Adding a multiple of one row to another does not change the determinant.
(2) Interchanging two rows changes the sign of the determinant.
(3) Multiplying a row by any scalar (including zero) multiplies the determinant by
the same scalar.
(4) Finally, the determinant function is fixed by setting
det I = 1.

(1.80)

Checking that all four of these axioms hold in the 2 2 case (1.79) is left as an
elementary exercise for the reader. A particular consequence of axiom 3 is that when we
multiply a row of any matrix A by the zero scalar, the resulting matrix, which has a row
of all zeros, necessarily has zero determinant.
Lemma 1.48. Any matrix with one or more all zero rows has zero determinant.
Since the basic determinantal axioms tell how determinants behave under all three
of our elementary row operations, we can use Gaussian elimination to compute a general
determinant.
Theorem 1.49. If A is a regular matrix, with A = L U factorization as in (1.23),
then
det A = det U =

n
Y

uii

(1.81)

i=1

3/7/03

48

c 2003

Peter J. Olver

equals the the product of the pivots. More generally, if A is nonsingular, and requires k
row interchanges to arrive at its permuted factorization P A = L U , then
det A = det P det U = (1)

n
Y

uii .

(1.82)

i=1

Finally, A is singular if and only if det A = 0.

Proof : In the regular case, one only need elementary row operations of type #1 to
reduce A to upper triangular form U , and axiom 1 says these do not change the determinant. Therefore det A = det U . Proceeding with the full GaussJordan scheme, the next
phase is to divide each row in U by its pivot, leading to the special upper triangular matrix
V with all 1s on the diagonal. Axiom 3 implies
!
n
Y
uii
det V.
(1.83)
det A = det U =
i=1

Finally, we can reduce V to the identity by further row operations of Type #1, and so by
(1.80),
det V = det I = 1.
(1.84)

Combining equations (1.83), (1.84) proves the theorem for the regular case. The nonsingular case follows without difficulty each row interchange changes the sign of the
determinant, and so det A equals det U if there have been an even number of interchanges,
but equals det U if there have been an odd number.
Finally, if A is singular, then we can reduce it to a matrix with at least one row of
zeros by elementary row operations of types #1 and #2. Lemma 1.48 implies that the
resulting matrix has zero determinant, and so det A = 0 also.
Q.E.D.
Corollary 1.50. The determinant of a diagonal matrix is the product of the diagonal
entries. The same result holds for both lower triangular and upper triangular matrices.
Example 1.51. Let us compute the

1
2
A=
0
1

determinant of the 4 4 matrix

0 1 2
1 3 4
.
2 2 3
1 4 2

We perform our usual Gaussian Elimination algorithm, successively leading to the matrices

1 0 1 2
1 0 1 2
1 0 1 2
0 1 1 0
0 1 1 0
0 1 1 0
A 7
7
7
,
0 2 2 3
0 0 0
3
0 0 2 4
0 1 3 4
0 0 2 4
0 0 0
3

where we used a single row interchange to obtain the final upper triangular form. Owing
to the row interchange, the determinant of the original matrix is 1 times the product of
the pivots:
det A = 1 1 1 ( 2) 3 = 6.
3/7/03

49

c 2003

Peter J. Olver

In particular, this tells us that A is nonsingular. But, of course, this was already implied
by the elimination, since the matrix reduced to upper triangular form with 4 pivots.
Let us now present some of the basic properties of determinants.
Lemma 1.52. The determinant of the product of two square matrices of the same
size is the product of the determinants:
det(A B) = (det A) (det B).

(1.85)

Proof : The product formula holds if A is an elementary matrix by the fundamental properties of the determinant combined with Corollary 1.50. By induction, if A =
E1 E2 EN is a product of elementary matrices, then (1.85) also holds. Therefore, the
result holds whenever A is nonsingular. On the other hand, if A is singular, then it can
be written as a product A = E1 E2 EN Z, where Z is a matrix with a row of zeros. But
then Z B = W also has a row of zeros, and so A B = E1 E2 EN W is also singular. Thus,
both sides of (1.85) are zero in this case.
Q.E.D.
It is a remarkable fact that, even though matrix multiplication is not commutative, and
so A B 6= B A in general, it is nevertheless always true that both products have the same
determinant: det(A B) = det(B A), since both are equal to the product (det A)(det B) of
the determinants of A and B, and ordinary multiplication is commutative.
Lemma 1.53. Transposing a matrix does not change its determinant:
det AT = det A.

(1.86)

Proof : By inspection, this formula holds if A is an elementary matrix. If A =


E1 E2 EN is a product of elementary matrices, then using (1.55), (1.85) and induction
T T
T
T
T
det AT = det(E1 E2 EN )T = det(EN
EN 1 E1T ) = (det EN
)(det EN
1 ) (det E1 )
= (det EN )(det EN 1 ) (det E1 ) = (det E1 )(det E2 ) (det EN )

= det(E1 E2 EN ) = det A.

The middle equality follows from the commutativity of ordinary multiplication. This proves
the nonsingular case; the singular case follows from Lemma 1.29, which implies that A T is
singular if and only if A is.
Q.E.D.
Remark : Lemma 1.53 has the interesting consequence that one can equally well use
elementary column operations to compute determinants. We will not develop this in
any detail here, since it does not help us to solve linear equations.
Finally, we state the general formula for a determinant. A proof of this formula based
on expansion can be found in [113].
Theorem 1.54. If A is an n n matrix with entries aij , then
X
det A =
a1,(1) a2,(2) an,(n) .

(1.87)

3/7/03

50

c 2003

Peter J. Olver

The sum in (1.87) is over all possible permutations of the columns of A. The
summands consist of all possible ways of choosing n entries of A with one entry in each
column and 1 entry in each row of A. The sign in front of the indicated term depends on
the permutation ; it is + if is an even permutation, meaning that its matrix can be
reduced to the identity by an even number of row interchanges, and is is odd. For
example, the six terms in the well-known formula

a11 a12 a13


a11 a22 a33 + a12 a23 a31 + a13 a21 a 32
(1.88)
det a21 a22 a23 =
a11 a23 a32 a12 a21 a33 a13 a22 a31
a31 a32 a33
for a 3 3 determinant correspond to the six possible 3 3 permutation matrices (1.33).
Unfortunately, the explicit determinant formula (1.87) contains n ! terms, and so,
as soon as n is even moderately large, is completely impractical for computation. The
most efficient way is still our mainstay Gaussian Elimination coupled the fact that the
determinant is the product of the pivots!
The proof that (1.87) obeys the basic determinantal axioms is not difficult, but, will
not be done here. The reader might wish to try the 3 3 case to be convinced that it
works. This verification will provide the complete proof that the determinant function
is well-defined, with explicit formula (1.87) that satisfies all four basic axioms. In this
manner, we complete the proof of Theorem 1.47.

3/7/03

51

c 2003

Peter J. Olver

Chapter 2
Vector Spaces
Vector spaces and their ancillary structures form the foundation of linear mathematics, and are an essential prerequisite for understanding contemporary applied mathematics.
The key concepts of vector space, subspace, linear independence, span, and basis will appear, not only in linear systems of equations and the geometry of n-dimensional Euclidean
space, but also in the analysis of linear ordinary differential equations, linear partial differential equations, linear boundary value problems, all of Fourier analysis, numerical
approximations like the finite element method, and many, many other fields. Therefore,
in order to properly develop the wide variety of methods and applications in this text, it
is essential that we gain a firm working knowledge of basic vector space analysis.
One of the great triumphs of modern mathematics was the recognition that many
seemingly distinct constructions are, in fact, different manifestations of the same general
abstract structure. Moreover, proofs that appear complicated in any particular context
often turn out to be relatively straightforward when formulated in general vector space
terms. The abstract notion of a vector space serves to unify spaces of ordinary vectors, as
well as spaces of functions, such as polynomials, exponentials, trigonometric functions, as
well as spaces of matrices, spaces of linear operators, and so on, all in a common framework.
We will study the four fundamental subspaces associated with a matrix range, kernel,
corange and cokernel and explain how they help us understand the solution to linear
algebraic systems, including our first encounter with the all-pervasive linear superposition
principle. Applications in graph theory, arising in electrical circuits and then generalized
to mechanical structures, will form the final topic of this chapter.
The price that one pays for the increasing level of abstraction is that, while the
underlying mathematics is not that difficult, the student typically takes a long time to
assimilate the level of abstraction demanded. The best way to approach the subject is to
think in terms of concrete examples. First, make sure you understand what the concept or
theorem says in the case of ordinary Euclidean space R n . Once this is grasped, the next
important case to consider is an elementary function space, e.g., the space of continuous
scalar functions. With these two examples firmly in hand, the leap to the general abstract
version should not be too hard. Patience is key; ultimately the only way to truly understand
an abstract concept like a vector space is by working with it! And keep in mind that the
effort expended here will be amply rewarded later on.

2.1. Vector Spaces.


A vector space is the abstract formulation of the most basic underlying properties
of n-dimensional Euclidean space R n , which is the space consisting of all real (column)
3/7/03

52

c 2003

Peter J. Olver

vectors with n entries. The basic laws of vector addition and scalar multiplication in
R n serve as the motivation for the general, abstract definition of a vector space. In the
beginning, we will refer to the elements of a vector space as vectors, even though, as we
shall see, they might also be functions or matrices or even more general objects. Unless
dealing with certain specific examples such as a space of functions, we will use bold face,
lower case Latin letters to denote the elements of our vector space.
Definition 2.1. A vector space is a set V equipped with two operations:
(i ) Addition: adding any pair of vectors v, w V produces another vector v + w V ;
(ii ) Scalar Multiplication: multiplying a vector v V by a scalar c R produces a vector
cv V ;
which are required to satisfy the following axioms for all u, v, w V and all scalars c, d R:
(a) Commutativity of Addition: v + w = w + v.
(b) Associativity of Addition: u + (v + w) = (u + v) + w.
(c) Additive Identity: There is a zero element, denoted 0 V , satisfying
v + 0 = v = 0 + v.
(d) Additive Inverse: For each v V there is an element v V such that
v + ( v) = 0 = ( v) + v.
(e) Distributivity: (c + d) v = (c v) + (d v), and c (v + w) = (c v) + (c w).
(f ) Associativity of Scalar Multiplication: c (d v) = (c d) v.
(g) Unit for Scalar Multiplication: the scalar 1 R satisfies 1 v = v.
Note: We will use bold face 0 to denote the zero element of our vector space, while
ordinary 0 denotes the real number zero. The following identities are elementary consequences of the vector space axioms.
(h) 0 v = 0.
(i) (1) v = v.
(j) c 0 = 0.
(k) If c v = 0, then either c = 0 or v = 0.
Let us, as an example, prove (h). Let z = 0 v. Then, by the distributive property,
z + z = 0 v + 0 v = (0 + 0) v = 0 v = z.
Adding z to both sides of this equation, and making use of axioms (b), (d), (c), implies
that z = 0, which completes the proof. Verification of the other three properties is left as
an exercise for the reader.
Remark : For most of this chapter we will deal with real vector spaces, in which the
scalars are the real numbers R. Complex vector spaces, where complex scalars are allowed,
will be introduced in Section 3.6.
Example 2.2. As noted above, the prototypical example of a real vector space is the
T
space R n consisting of column vectors or n-tuples of real numbers v = ( v1 , v2 , . . . , vn ) .
3/7/03

53

c 2003

Peter J. Olver

Vector addition and scalar multiplication are defined in the usual manner:
v +w
1
1
v2 + w 2
v+w =
..

vn + w n

cv

c v2

cv =
.. ,
.
c vn
T

v
1

whenever

w
1

w
v2
, w = .2 .
v=
.
.
.
.
.
wn
vn

The zero vector 0 = ( 0, . . . , 0 ) . The fact that vectors in R n satisfy the vector space
axioms is an immediate consequence of the laws of vector addition and scalar multiplication.
Details are left to the reader.
Example 2.3. Let Mmn denote the space of all real matrices of size m n. Then
Mmn forms a vector space under the laws of matrix addition and scalar multiplication.
The zero element is the zero matrix O. Again, the vector space axioms are immediate
consequences of the basic laws of matrix arithmetic. (For the purposes of this example, we
ignore additional matrix properties, like matrix multiplication.) The preceding example of
the vector space R n = M1n is a particular case when the matrices have only one column.
Example 2.4. Consider the space

P (n) = p(x) = an xn + an1 xn1 + + a1 x + a0

(2.1)

consisting of all polynomials of degree n. Addition of polynomials is defined in the usual


manner. For example,
(x2 3 x) + (2 x2 5 x + 4) = 3 x2 8 x + 4.
Note that the sum p(x) + q(x) of two polynomials of degree n also has degree n.
(However, it is not true that the sum of two polynomials of degree = n also has degree n;
for example (x2 + 1) + ( x2 + x) = x + 1 has degree 1 even though the two summands
have degree 2. This means that the set of polynomials of degree = n is not a vector
space.) The zero element of P (n) is the zero polynomial. We can multiply polynomials
by scalars real constants in the usual fashion; for example if p(x) = x 2 2 x, then
3 p(x) = 3 x2 6 x. The proof that P (n) satisfies the basic vector space axioms is an easy
consequence of the basic laws of polynomial algebra.
Remark : We are ignoring the fact that one can also multiply polynomials; this is not
a vector space operation. Also, any scalar can be viewed as a constant polynomial, but one
should really regard these as two completely different objects one is a number , while
the other is a constant function. To add to the confusion, one typically uses the same
notation for these two objects; for instance, 1 could either mean the real number 1 or the
constant function taking the value 1 everywhere. The reader needs to exercise due caution
and judgment when interpreting each occurrence.
For much of analysis, including differential equations, Fourier theory, numerical methods, and so on, the most important examples of vector spaces are spaces of functions. The
simplest such example is the following.
3/7/03

54

c 2003

Peter J. Olver

Example 2.5. Let I R be an interval. Consider the function space


F = F(I) = { f : I R }
consisting of all real-valued functions f (x) which are defined for all x I. The claim is
that the function space F has the structure of a vector space. Addition of functions in F
is defined in the usual manner: (f + g)(x) = f (x) + g(x). Multiplication by scalars c R
is the same as multiplication by constants, (c f )(x) = c f (x). Again, the proof of the basic
vector space axioms is straightforward. As in the preceding remark, we are ignoring all
additional operations multiplication, division, inversion, composition, etc. that can
be done with functions; these are irrelevant for the vector space structure of F.
Remark : An interval can be
(a) closed , meaning that it includes its endpoints: I = [ a, b ],
(b) open, which does not include either endpoint: I = ( a, b ), or
(c) half open, which includes one but not the other endpoint, so I = [ a, b ) or ( a, b ].
An open endpoint is allowed to be infinite; in particular, ( , ) = R is another way of
writing the real line.
Example 2.6. The preceding examples are all, in fact, special cases of an even more
general construction. A clue is to note that the last example of a function space does not
make any use of the fact that the domain of definition of our functions is a real interval.
Indeed, the construction produces a vector space corresponding to any subset I R.
Even more generally, let S be any set. Let F = F(S) denote the space of all realvalued functions f : S R. Then we claim that V is a vector space under the operations
of function addition and scalar multiplication. More precisely, given functions f and g,
we define their sum to be the function h = f + g such that h(x) = f (x) + g(x) for all
x S. Similarly, given a function f and a real scalar c R, we define the scalar multiple
k = c f to be the function such that k(x) = c f (x) for all x S. The verification of the
vector space axioms proceeds straightforwardly, and the reader should be able to supply
the details.
In particular, if S R is an interval, then F(S) coincides with the space of scalar
functions described in the preceding example. If S R n is a subset of n-dimensional
Euclidean space, then the elements of F(S) are functions f (x1 , . . . , xn ) depending upon
the n variables corresponding to the coordinates of points x = (x 1 , . . . , xn ) S in the
domain. In this fashion, the set of real-valued functions defined on any domain in R n is
found to also form a vector space.
Another useful example is to let S = {x1 , . . . , xn } be a finite set of increasing real
numbers x1 < x2 < . . . < xn . A real-valued function f : S R is defined by its
values f (x1 ), f (x2 ), . . . f (xn ) at the specified points. In applications, one can view such
functions as the sample values of a scalar function f (x) at the sample points x 1 , . . . , xn .
For example, when measuring a physical quantity, e.g., temperature, velocity, pressure,
etc., one typically only measures a finite set of sample values. The intermediate, nonrecorded values between the sample points are then reconstructed through some form of
3/7/03

55

c 2003

Peter J. Olver

interpolation a topic that we shall visit in depth later on. Interestingly, the sample
values f (xi ) can be identified with the entries fi of a vector
T

f = ( f1 , f2 , . . . , fn ) = ( f (x1 ), f (x2 ), . . . , f (xn ) )

Rn,

known as the sample vector . Every sampled function f : {x1 , . . . , xn } R corresponds


to a unique vector f R n and vice versa. Addition of sample functions corresponds to
addition of their sample vectors, as does scalar multiplication. Thus, the vector space
of sample functions F(S) = F( {x1 , . . . , xn } ) is the same as the vector space R n ! This
connection between sampled functions and vectors will be the key to the finite Fourier
transform, of fundamental importance in modern signal processing.
Example 2.7. The preceding construction admits yet a further generalization. We
continue to let S be an arbitrary set. Let V be a vector space. The claim is that the space
F(S, V ) consisting of all functions f : S V is a vector space. In other words, we replace
the particular vector space R in the preceding example by a general vector space, and the
same conclusion holds. The operations of function addition and scalar multiplication are
defined in the evident manner: (f + g)(x) = f (x) + g(x) and (c f )(x) = c f (x), where we are
using the vector addition and scalar multiplication operations on V to induce corresponding
operations on V valued functions. The proof that F(S, V ) satisfies all of the vector space
axioms proceeds as before.
The most important example is when S R n is a domain in Euclidean space and
V = R m is itself a Euclidean space. In this case, the elements of F(S, R m ) consist of
vector-valued functions f : S R m , so that

f1 (x1 , . . . , xn )
f (x , . . . , xn )
f (x) = 2 1

...
fm (x1 , . . . , xn )
is a column vector consisting of m functions of n variables, all defined on a common
domain S. The general construction implies that addition and scalar multiplication of
vector-valued functions is done componentwise; for example
2

x2
ex 4

cos x
x

2 x2 cos x
2 ex x 8

2.2. Subspaces.
In the preceding section, we have already met the most basic examples of vector
spaces to be used in this text. Most of the important vector spaces in applications appear
as particular subsets of these examples.
Definition 2.8. A subspace of a vector space V is a subset W V which is a vector
space in its own right.
3/7/03

56

c 2003

Peter J. Olver

In other words, if v, w W , c R, then we compute the sum v + w, and scalar


product c v, considering v, w as elements of V , but require that the resulting vectors
belong to W . In particular, W must contain the zero element of V in order to satisfy
axiom (c).
The verification of the vector space axioms for a subspace is particularly easy. We
only need check that vector addition and scalar multiplication keep us within the subspace.
Proposition 2.9. A subset W V of a vector space is a subspace if and only if it
is closed under addition and scalar multiplication, which requires that
(a) for every v, w W , the sum v + w W , and
(b) for every v W and every c R, the scalar product c v W .
Proof : The proof is essentially trivial. For example, to show commutativity, given
v, w W , we can regard them as elements of V , in which case v + w = w + v because V
is a vector space. But the closure condition implies that the sum also belongs to W , and
so the commutativity axiom also holds for elements of W . The other axioms are equally
easy to verify.
Q.E.D.
Remark : It will sometimes be useful to combine the two closure conditions. Thus, to
prove W V is a subspace it suffices to check that c v + d w W for every v, w W and
c, d R.
Example 2.10. Let us list some examples of subspaces of the three-dimensional
Euclidean space R 3 :
(a) The trivial subspace W = {0}.
(b) The entire space W = R 3 .
(c) The set of all vectors of the form (x, y, 0), i.e., the (x, y)coordinate plane. Note
that the sum (x, y, 0)+(b
x, yb, 0) = (x+b
x, y+b
y , 0), and scalar multiple c (x, y, 0) = (c x, c y, 0),
of vectors in the (x, y)plane also lie in the plane, proving closure.
(d) The set of solutions (x, y, z) to the homogeneous linear equation
3 x + 2 y z = 0.
Indeed, if v = (x, y, z) is a solution, then so is any scalar multiple c v = (cx, cy, cz) since
3(cx) + 2(cy) (cz) = c(3x + 2y z) = 0.
b = (b
b = (x + x
Moreover, if v
x, yb, zb) is a second solution, the sum v + v
b, y + yb, z + zb) is also
a solution since
3 (x + x
b) + 2 (y + yb) (z + zb) = (3 x + 2 y z) + (3 x
b + 2 yb zb) = 0.

Note that the solution space is a two-dimensional plane consisting of all vectors which are
perpendicular (orthogonal) to the vector (3, 2, 1).
(e) The set of all vectors lying in the plane spanned by the vectors v1 = (2, 3, 0)
and v2 = (1, 0, 3). In other words, we consider all vectors of the form
v = a v1 + b v2 = a (2, 3, 0) + b (1, 0, 3) = (2 a + b, 3 a, 3 b),
3/7/03

57

c 2003

Peter J. Olver

where a, b R are arbitrary scalars. If v = a v1 + b v2 and w = b


a v1 + bb v2 are any two
vectors of this form, so is
c v + d w = c (a v1 + b v2 ) + d (b
a v1 + bb v2 ) = (a c + b
a d)v1 + (b c + bb d)v2 = e
a v1 + eb v2 ,

where e
a = ac + b
a d, eb = b c + bb d. This proves that the plane is a subspace. The reader
might already have noticed that this subspace coincides with the plane 3 x + 2 y z = 0
considered in item (d).
Example 2.11. The following subsets of R 3 are not subspaces.
(a) The set P of all vectors of the form (x, y, 1), i.e., the plane parallel to the
x y coordinate plane passing through (0, 0, 1). Indeed, 0 6 P , which is the most basic
requirement for a subspace. In fact, neither of the closure axioms hold for this subset.
(b) The positive orthant O + = {x > 0, y > 0, z > 0}. While the sum of two
vectors in O + belongs to O + , multiplying by negative scalars takes us outside the orthant,
violating closure under scalar multiplication.
(c) The unit sphere S 2 = { x2 + y 2 + z 2 = 1 }. Again, 0 6 S 2 . More generally,
curved surfaces, e.g., the paraboloid P = { z = x2 + y 2 }, are not subspaces. Although
0 P , most scalar multiples of vectors in P are not in P . For example, (1, 1, 2) P , but
2 (1, 1, 2) = (2, 2, 4) 6 P .
In fact, there are only four fundamentally different types of subspaces W R 3 of
three-dimensional Euclidean space:
(i ) The entire space W = R 3 ,
(ii ) a plane passing through the origin,
(iii ) a line passing through the origin,
(iv ) the trivial subspace W = {0}.
It is not hard to verify this observation: If W = {0} contains only the zero vector, then
we are in case (i). Otherwise, W R 3 contains a nonzero vector 0 6= v1 W . But since
all scalar multiples c v1 of elements of W belong thereunto, W must contain the entire
line in the direction of v1 . If W contains another vector v2 that does not lie in the line
through v1 , then it must contain the entire plane {c v1 + d v2 } spanned by v1 , v2 . Finally,
if there is a third vector v3 not contained in this plane, then we claim that W = R 3 . This
final fact will be an immediate consequence of general results in this chapter, although the
interested reader might try to prove it directly before proceeding.
Example 2.12. Let I R be an interval, and let F(I) be the space of real-valued
functions f : I R. Let us look at some of the most important examples of subspaces
of F(I). In each case, we need only verify the closure conditions to verify that the given
subset is indeed a subspace.
(a) The space P (n) of polynomials of degree n, which we already encountered.
S
(b) The space P () = n0 P (n) consisting of all polynomials.
(c) The space C0 (I) of all continuous functions. Closure of this subspace relies on
knowing that if f (x) and g(x) are continuous, then both f (x) + g(x) and cf (x) for any
c R are also continuous two basic results from calculus.
3/7/03

58

c 2003

Peter J. Olver

(d) More restrictively, one can consider the subspace Cn (I) consisting of all functions
f (x) that have n continuous derivatives f 0 (x), f 00 (x), . . . , f (n) (x) on I. Again, we need to
know that if f (x) and g(x) have n continuous derivatives, so do f (x) + g(x) and cf (x) for
any c R.
T
(e) The space C (I) = n0 Cn (I) of infinitely differentiable or smooth functions
is also a subspace. (The fact that this intersection is a subspace follows directly from
Exercise .)
(f ) The space A(I) of analytic functions on the interval I. Recall that a function
f (x) is called analytic at a point a if it is smooth, and, moreover, its Taylor series
f (a) + f 0 (a) (x a) +

1
2

f 00 (a) (x a)2 + =

X
f (n) (a)
(x a)n
n!
n=0

(2.2)

converges to f (x) for all x sufficiently close to a. (It does not have to converge on the entire
interval I.) Not every smooth function is analytic, and so A(I) ( C (I). An explicit
example is the function
1/x
e
,
x > 0,
(2.3)
f (x) =
0,
x 0.
It can be shown that every derivative of this function at 0 exists and equals zero: f (n) (0) =
0, n = 0, 1, 2, . . ., and so the function is smooth. However, its Taylor series at a = 0 is
0 + 0 x + 0 x2 + 0, which converges to the zero function, not to f (x). Therefore f (x)
is not analytic at a = 0.
(g) The set of all mean zero functions. The mean or average of a function defined
on a closed interval I = [ a, b ] is the real number
Z b
1
f (x) dx.
(2.4)
f=
ba a
Z b
f (x) dx = 0. Note that f + g = f + g,
In particular, f has mean zero if and only if
a

and so the sum of two mean zero functions also has mean zero. Similarly, cf = c f , and
any scalar multiple of a mean zero function also has mean zero.
(h) Let x0 I be a given point. Then the set of all functions f (x) that vanish at
the point, f (x0 ) = 0, is a subspace. Indeed, if f (x0 ) = 0 and g(x0 ) = 0, then clearly
(f + g)(x0 ) = 0 and c f (x0 ) = 0, proving closure. This example can be generalized to
functions that vanish at several points.
(i) The set of all solutions u = f (x) to the homogeneous linear differential equation
u00 + 2 u0 3 u = 0.
Indeed, if u = f (x) and u = g(x) are solutions, so are u = f (x) + g(x) and u = c f (x) for
any c R. Note that we do not need to actually solve the equation to verify these claims!

If I = [ a, b ] is closed, we use the appropriate one-sided derivatives at its endpoints.

3/7/03

59

c 2003

Peter J. Olver

They follow directly from linearity; for example


(f + g)00 + 2(f + g)0 3(f + g) = (f 00 + 2 f 0 3 f ) + (g 00 + 2 g 0 3 g) = 0.
Warning: In the last three examples, the value 0 is essential for the indicated set of
functions to be a subspace. The set of functions such that f (x0 ) = 1, say, is not a subspace.
The set of functions with a fixed nonzero mean, say f = 3, is also not a subspace. Nor is
the set of solutions to an inhomogeneous ordinary differential equation, say
u00 + 2 u0 3 u = x 3.
None of these subsets satisfy the closure conditions.

2.3. Span and Linear Independence.


The concept of the span of a collection of elements of a vector space generalizes, in
a natural fashion, the geometric notion of two vectors spanning a plane in R 3 . As such,
it forms the first of two important, general methods for constructing subspaces of vector
spaces.
Definition 2.13. Let v1 , . . . , vk be any elements of a vector space V . A linear
combination of these elements is given by a sum
c1 v1 + + c k vk =

k
X

ci vi ,

(2.5)

i=1

where the coefficients c1 , . . . , ck are any scalars.


For example,
3 v1 + v2 2 v 3 ,

8 v1 31 v3 , v2 = 0 v1 + v2 + 0 v3 ,

and

0 = 0 v 1 + 0 v2 + 0 v3 ,

are four different linear combinations of the three vector space elements v 1 , v2 , v3 V .
Definition 2.14. Let v1 , . . . , vk be a finite collection of elements of a vector space
V . Their span is the subset W = span {v1 , . . . , vk } V consisting of all possible linear
combinations (2.5).
The key observation is that a span always forms a subspace. Indeed, many subspaces
arise in this basic manner.
Proposition 2.15. The span of a collection of vectors, W = span {v1 , . . . , vk },
forms a subspace of the underlying vector space.
Proof : We need to show that if
v = c 1 v1 + + c k vk

and

are any two linear combinations, then their sum

3/7/03

b=b
v
c 1 v1 + + b
c k vk

b = (c1 + b
v+v
c1 )b
v1 + + (ck + b
ck )vk ,
60

c 2003

Peter J. Olver

is also a linear combination, as is any scalar multiple


a v = (a c1 )v1 + + (a ck )vk
for a R. This completes the proof.

Q.E.D.

Example 2.16. Examples of subspaces spanned by vectors in R 3 .


(i ) If v1 6= 0 is any non-zero vector in R 3 , then its span is the line { c v1 | c R } in
the direction of v1 . If v1 = 0, then its span just consists of the origin.
(ii ) If v1 and v2 are any two vectors in R 3 , then their span is the set of all vectors of
the form c1 v1 + c2 v2 . Typically, the span is a plane passing through the origin. However,
if v1 and v2 are parallel, then their span is just a line. The most degenerate case is when
v1 = v2 = 0, where the span is just a point the origin.
(iii ) If we are given three non-coplanar vectors v1 , v2 , v3 , then their span is all of R 3 ,
as we shall prove below. However, if they all lie in a plane, then their span is the plane
unless they are all parallel, in which case their span is a line, or, when v 1 = v2 = v3 = 0,
a single point.
Thus, any subspace of R 3 can be realized as the span of some set of vectors. Note
that we can also consider the span of four or more vectors, but the range of possible
subspaces is limited, as above, to either a point (the origin), a line, a plane, or the entire
three-dimensional space.
Example 2.17. Let V = F(R) denote the space of all scalar functions f (x).
(a) The span of the three monomials f1 (x) = 1, f2 (x) = x and f3 (x) = x2 is the set
of all functions of the form
f (x) = c1 f1 (x) + c2 f2 (x) + c3 f3 (x) = c1 + c2 x + c3 x2 ,
where c1 , c2 , c3 are arbitrary scalars (constants). In other words, span {1, x, x 2 } = P (2) is
the subspace of all quadratic (degree 2) polynomials.
(b) The next example arises in mechanics, electrical circuits and signal processing.
Let R be a fixed frequency. Let f1 (x) = cos x, f2 (x) = sin x. Their span consists
of all functions of the form
f (x) = c1 f1 (x) + c2 f2 (x) = c1 cos x + c2 sin x.

(2.6)

For example, the function cos( x + 2) lies in the span because, by the addition formula
for the cosine,
cos( x + 2) = cos 2 cos x sin 2 sin x
is a linear combination of cos x and sin x. In fact, we can express the entire span in an
alternative phase-amplitude form,
f (x) = c1 cos x + c2 sin x = r cos( x ).

(2.7)

Expanding the right hand side, we find


r cos( x ) = r cos cos x + r sin sin x
3/7/03

61

c 2003

Peter J. Olver

3
2
1
-4

-2

2
-1
-2
-3

Graph of 3 cos(2 x 1).

Figure 2.1.
and hence
c1 = r cos ,

c2 = r sin .

We can view the amplitude r > 0 and the phase shift as the polar coordinates of the
coefficients c1 , c2 . Thus, any combination of sin x and cos x can be rewritten as a single
cosine, with a phase lag. Figure 2.1 shows the particular case 3 cos(2 x 1) which has
amplitude r = 3, frequency = 2 and phase shift = 1. Note that the first peak appears
at x = / = 12 .
(c) The space T (2) of quadratic trigonometric polynomials is spanned by the functions
1,
cos x,
sin x,
cos2 x,
cos x sin x,
sin2 x.
Thus, the general quadratic trigonometric polynomial can be written as a linear combination
q(x) = c0 + c1 cos x + c2 sin x + c3 cos2 x + c4 cos x sin x + c5 sin2 x,
(2.8)
where c0 , . . . , c5 are arbitrary constants. A more useful spanning set for the same subspace
is the trigonometric functions
1,

cos x,

sin x,

cos 2 x,

sin 2 x.

(2.9)

Indeed, by the double angle formulas, both


cos 2 x = cos2 x sin2 x,

sin 2 x = 2 sin x cos x,

have the form of a quadratic trigonometric polynomial (2.8), and hence both belong to
T (2) . On the other hand, we can write
cos2 x =

1
2

cos 2 x + 12 ,

cos x sin x =

1
2

sin 2 x,

sin2 x = 12 cos 2 x + 12 ,

in terms of the functions (2.9). Therefore, (2.8) can be written in the alternative form

q(x) = c0 +

1
2

c3 +

1
2

c5 + c1 cos x + c2 sin x + 12 c3

=b
c0 + b
c1 cos x + b
c2 sin x + b
c3 cos 2 x + b
c4 sin 2 x,

3/7/03

62

1
2

c5

cos 2 x +

1
2

c4 sin 2 x
(2.10)

c 2003

Peter J. Olver

and so the functions (2.9) do indeed span T (2) . It is worth noting that we first characterized T (2) as the span of 6 functions, whereas the second characterization only required 5
functions. It turns out that 5 is the minimal number of functions needed to span T (2) , but
the proof of this fact will be deferred until Chapter 3.
(d) The homogeneous linear ordinary differential equation
u00 + 2 u0 3 u = 0.

(2.11)

considered in part (i) of Example 2.12 has two independent solutions: f 1 (x) = ex and
f2 (x) = e 3 x . (Now may be a good time for the reader to review the basic techniques for
solving linear, constant coefficient ordinary differential equations.) The general solution
to the differential equation is a linear combination
u = c1 f1 (x) + c2 f2 (x) = c1 ex + c2 e 3 x .
Thus, the vector space of solutions to (2.11) is described as the span of these two basic
solutions.
Remark : One can also define the span of an infinite collection of elements of a vector
space. To avoid convergence issues, one should only consider finite linear combinations
(2.5). For example, the span of the monomials 1, x, x2 , x3 , . . . is the space P () of all
polynomials. (Not the space of convergent Taylor series.) Similarly, the span of the
functions 1, cos x, sin x, cos 2 x, sin 2 x, cos 3 x, sin 3 x, . . . is the space of all trigonometric
polynomials, to be discussed in great depth in Chapter 11.
Linear Independence and Dependence
Most of the time, all the vector space elements used to form a span are essential. For
example, we cannot use fewer than two vectors to span a specified plane. However, in
the more degenerate cases, some of the spanning elements are not needed. For instance,
if the two vectors are parallel, then their span is a line, but only one of the vectors is
really needed to define the line. Similarly, the subspace of function space spanned by the
polynomials
p4 (x) = x2 + 1.
(2.12)
is the vector space P (2) of quadratic polynomials. But only three of the polynomials are
really required to span P (2) , as we will show below.
The elimination of such superfluous spanning elements is encapsulated in the following
basic definition.
p1 (x) = x 2,

p2 (x) = x2 5 x + 4,

p3 (x) = 3 x2 4 x,

Definition 2.18. The vectors v1 , . . . , vk V are called linearly dependent if there


exist a collection of scalars c1 , . . . , ck , not all zero, such that
c1 v1 + + ck vk = 0.

(2.13)

Vectors which are not linearly dependent are called linearly independent.
3/7/03

63

c 2003

Peter J. Olver

The restriction that the ci s not all simultaneously vanish is essential. Indeed, if
c1 = = ck = 0, then the linear combination (2.13) is automatically zero. To check
linear independence, one needs to show that the only linear combination that produces
the zero vector (2.13) is this trivial one. In other words, c1 = = ck = 0 is the one and
only solution to the vector equation (2.13).
Example 2.19. Some examples of linear independence and dependence:
(a) The vectors

1
0
1
v 3 = 4 ,
v 2 = 3 ,
v1 = 2 ,
3
1
1

are linearly dependent. Indeed,

v1 2 v2 + v3 = 0.
On the other hand, the first two vectors v1 , v2 are linearly independent. To see this,
suppose that


0
c1
c1 v1 + c 2 v2 = 2 c 1 + 3 c 2 = 0 .
0
c1 + c2

Thus, the coefficients c1 , c2 must satisfy the homogeneous linear system


c1 = 0,

2 c1 + 3 c2 = 0,

c1 + c2 = 0,

which has only the trivial solution c1 = c2 = 0.


(b) In general, any collection v1 , . . . , vk that includes the zero vector, say v1 = 0, is
automatically linearly dependent, since 1 v1 + 0 v2 + + 0 vk = 0 is a nontrivial linear
combination that adds up to 0.
(c) The polynomials (2.12) are linearly dependent; indeed,
p1 (x) + p2 (x) p3 (x) + 2p4 (x) 0
is a nontrivial linear combination that vanishes identically. On the other hand, the first
three polynomials, p1 (x), p2 (x), p3 (x), are linearly independent. Indeed, if
c1 p1 (x) + c2 p2 (x) + c3 p3 (x) = (c2 + 3 c3 ) x2 + (c1 5 c2 4 c3 ) x c1 + 3c2 0,
then c1 , c2 , c3 must solve the homogeneous linear system
c2 + 3 c3 = 0,

c1 5 c2 4 c3 = 0,

c1 + 3 c2 = 0.

But this has only the trivial solution c1 = c2 = c3 = 0, and so linear independence follows.
Remark : In the last example, we are using the basic fact that a polynomial is identically zero,
p(x) = a0 + a1 x + a2 x2 + + an xn 0
for all
x,
if and only if its coefficients all vanish: a0 = a1 = = an = 0. This is equivalent
to the self-evident fact that the basic monomial functions 1, x, x 2 , . . . , xn are linearly
independent; see Exercise .
3/7/03

64

c 2003

Peter J. Olver

Example 2.20. The set of quadratic trigonometric functions


1,

cos x,

sin x,

cos2 x,

sin2 x,

cos x sin x,

that were used to define the vector space T (2) of quadratic trigonometric polynomials, are,
in fact, linearly dependent. This is a consequence of the basic trigonometric identity
cos2 x + sin2 x 1
which can be rewritten as a nontrivial linear combination
1 + 0 cos x + 0 sin x cos2 x + 0 cos x sin x sin2 x 0
that gives the zero function. On the other hand, the alternative spanning set
1,

cos x,

sin x,

cos 2 x,

sin 2 x,

(2.14)

is linearly independent, since the only identically zero linear combination


c0 + c1 cos x + c2 sin x + c3 cos 2 x + c4 sin 2 x 0
is the trivial one c0 = . . . c4 = 0. However, the latter fact is not obvious, and requires a bit
of work to prove directly. An easy proof, based on orthogonality, will appear in Chapter 5.
Let us now focus our attention on the linear independence or dependence of a set
of vectors v1 , . . . , vk R n in Euclidean space. We begin by forming the n k matrix
A = ( v1 . . . vk ) whose columns are the given vectors. (The fact that we use column
vectors is essential here.) The key is a very basic formula
c
1

A c = c 1 v1 + + c k vk ,

where

c2

c=
.. ,
.
ck

(2.15)

that expresses any linear combination in terms of matrix multiplication. For example,


c1 + 3 c 2
1
3
0
1
3
0
c1
1 2
1 c2 = c1 + 2 c2 + c3 = c1 1 + c2 2 + c3 1 .
4
1
2
4 1 2
4 c1 c2 2 c 3
c3

Formula (2.15) is an immediate consequence of the basic rules of matrix multiplication;


see also Exercise c. It allows us to reformulate the notions of linear independence and
span in terms of linear systems of equations. The main result is the following:
Proposition 2.21. Let v1 , . . . , vk R n and let A = ( v1 . . . vk ) be the corresponding n k matrix.
(a) The vectors v1 , . . . , vk R n are linearly dependent if there is a non-zero solution
c 6= 0 to the homogeneous linear system A c = 0.
(b) The vectors are linearly independent if and only if the only solution to the homogeneous system A c = 0 is the trivial one c = 0.
(c) A vector b lies in the span of v1 , . . . , vk if and only if the linear system A c = b is
compatible, i.e., it has at least one solution.
3/7/03

65

c 2003

Peter J. Olver

Proof : We prove the first statement, leaving the other two as exercises for the reader.
The condition that v1 , . . . , vk be linearly dependent is that there is a nonzero vector
T

c = ( c1 , c2 , . . . , ck ) 6= 0
such that the linear combination
A c = c1 v1 + + ck vk = 0.
Therefore, linear dependence requires the existence of a nontrivial solution to the homogeneous linear system A c = 0.
Q.E.D.
Example 2.22. Let us determine whether the vectors

1
3
1
v3 = 4 ,
v2 = 0 ,
v1 = 2 ,
6
4
1

are linearly independent or linearly dependent.


a single matrix

1 3
A= 2 0
1 4


4
v4 = 2 ,
3

(2.16)

We combine them as column vectors into

1
4
6

4
2 .
3

According to Proposition 2.21, we need to figure out whether there are any nontrivial
solutions to the homogeneous equation A c = 0; this can be done by reducing A to row
echelon form, which is

1 3
1
4
U = 0 6 6 6 .
(2.17)
0 0
0
0
The general solution to the homogeneous system A c = 0 is

c = ( 2 c 3 c4 , c 3 c4 , c3 , c4 ) ,
where c3 , c4 the free variables are arbitrary. Any nonzero choice of c3 , c4 will produce
a nontrivial linear combination
(2 c3 c4 )v1 + ( c3 c4 )v2 + c3 v3 + c4 v4 = 0
that adds up to the zero vector. Therefore, the vectors (2.16) are linearly dependent.
In fact, Theorem 1.44 says that in this particular case we didnt even need to do the
row reduction if we only needed to answer the question of linear dependence or linear
independence. Any coefficient matrix with more columns than rows automatically has
a nontrivial solution to the associated homogeneous system. This implies the following
remark:
Lemma 2.23. Any collection of k > n vectors in R n is linearly dependent.
3/7/03

66

c 2003

Peter J. Olver

Warning: The converse to this lemma is not true. For example, the two vectors
T
T
v1 = ( 1, 2, 3 ) and v2 = ( 2, 4, 6 ) in R 3 are linearly dependent since 2 v1 + v2 = 0.
For a collection of n or fewer vectors in R n , one does need to perform the elimination to
calculate the rank of the corresponding matrix.
Lemma 2.23 is a particular case of the following general characterization of linearly
independent vectors.
Proposition 2.24. A set of k vectors in R n is linearly independent if and only if
the corresponding n k matrix A has rank k. In particular, this requires k n.
Or, to state the result another way, the vectors are linearly independent if and only if
the linear system A c = 0 has no free variables. The proposition is an immediate corollary
of Propositions 2.21 and 1.44.
Example 2.22. (continued) Let us now see which vectors b R 3 lie in the span
of the vectors (2.16). This will be the case if and only if the linear system A x = b has
a solution. Since the resulting row echelon form (2.17) has a row of all zeros, we know
that there will be a compatibility condition on the entries of b, and therefore not every
vector lies in the span. To find the precise condition, we augment the coefficient matrix,
and apply the same row operations, leading to the reduced augmented matrix

1 3
1
4
b1

0 6 6 6
.
b2 2 b 1

0 0
0
0 b3 + 76 b2 43 b1
T

Therefore, b = ( b1 , b2 , b3 ) lies in the span of these four vectors if and only if


34 b1 +

7
6

b2 + b3 = 0.

In other words, these four vectors only span a plane in R 3 , which passes through 0 and

T
has normal n = 43 , 76 , 1 .

In general, the same method shows that a collection of vectors will span all of R n if
and only if the row echelon form of the associated matrix contains no all zero rows, or,
equivalently, the rank is equal to n, the number of rows in the matrix.
Proposition 2.25. A collection of k vectors will span R n if and only if their n k
matrix has rank n. In particular, this requires k n.
Warning: Not every collection of n or more vectors in R n will span all of R n . A
counterexample is provided by the vectors (2.16).

2.4. Bases.
Given a vector space, a spanning set needs a sufficient number of distinct elements.
On the other hand, having too many elements in the spanning set will violate linear
independence, and cause redundancies. The optimal spanning sets are those that are also
linearly independent. Thus, by combining the concepts of span and linear independence,
we arrive at the essential concept of a basis.
3/7/03

67

c 2003

Peter J. Olver

A basis of a vector space V is a finite collection of elements

Definition 2.26.
v1 , . . . , vn V which

(a) span V , and


(b) are linearly independent.
Bases are absolutely fundamental in all applications of linear algebra, including matrix algebra, geometry of Euclidean space, solutions to linear differential equations, both
ordinary and partial, linear boundary value problems, Fourier analysis, signal and image
processing, data compression, control systems, and so on.
Example 2.27. The standard basis of R n consists of the n vectors

1
0

0

e1 =
... ,

0
0


0
1

0

e2 =
... ,

0
0

...


0
0

0

en =
... ,

0

(2.18)

so that ei is the vector with 1 in the ith slot and 0s elsewhere. We already encountered
these vectors as the columns of the n n identity matrix, cf. (1.44). They clearly span R n
since we can write any vector
x
1

x2

x=
.. = x1 e1 + + xn en ,
.
xn

(2.19)

as a linear combination, whose coefficients are the entries of x. Moreover, the only linear
combination that gives the zero vector x = 0 is the trivial one x1 = = xn = 0, and so
e1 , . . . , en are linearly independent.
Remark : In the three-dimensional case R 3 , a common physical notation for the standard basis is



0
0
1

(2.20)
k = e3 = 0 .
j = e2 = 1 ,
i = e1 = 0 ,
1
0
0
Interestingly, this notation has its origins in Hamiltons theory of quaternions, cf. [33].

There are many other bases of R 3 . Any three non-coplanar vectors will form a basis
this is a consequence of the following general characterization of bases in R n .
Theorem 2.28. Every basis of R n contains exactly n vectors. A set of n vectors
v1 , . . . , vn R n is a basis if and only if the n n matrix A = ( v1 . . . vn ) is nonsingular.
3/7/03

68

c 2003

Peter J. Olver

Proof : This is a direct consequence of Proposition 2.21. Linear independence requires


that the only solution to the homogeneous system A x = 0 is the trivial one x = 0.
Secondly, a vector b R n will lie in the span of v1 , . . . , vn if and only if the linear
system A x = b has a solution. Theorem 1.7 tells us that both results require that A be
nonsingular, i.e., have maximal rank n.
Q.E.D.
Thus, every basis of n-dimensional Euclidean space R n contains the same number of
vectors, namely n. This is a general fact, and motivates a linear algebra characterization
of dimension.
Theorem 2.29. Suppose the vector space V has a basis v1 , . . . , vn . Then every
other basis of V has of the same number of elements in it. This number is called the
dimension of V , and written dim V = n.
The proof of Theorem 2.29 rests on the following lemma.
Lemma 2.30. Suppose v1 , . . . , vn span a vector space V . Then every set of k > n
elements w1 , . . . , wk V is linearly dependent.
Proof : Let us write each element
wj =

n
X

aij vi ,

j = 1, . . . , k,

i=1

in terms of the spanning set. Then


c 1 w1 + + c k wk =

k
n X
X

aij cj vi .

i=1 j =1

This linear combination will be zero whenever there is a nonzero solution 0 6= c =


(c1 , . . . , ck ) to the homogeneous linear system of n equations in k > n unknowns
k
X

aij cj = 0,

i = 1, . . . , n.

j =1

Theorem 1.44 guarantees that every homogeneous system with more unknowns than equations always has a non-trivial solution, and this immediately implies that w 1 , . . . , wk are
linearly dependent.
Q.E.D.
Proof of Theorem 2.29 : Suppose we have two bases containing a different number of
elements. By definition, the smaller basis spans the vector space. But then Lemma 2.30
tell us that the elements in the larger purported basis must be linearly dependent. This
contradicts our assumption that both sets are bases, and proves the theorem.
Q.E.D.
As a direct consequence, we can now provide a precise meaning to the optimality
property of bases.
3/7/03

69

c 2003

Peter J. Olver

Theorem 2.31. Suppose V is an n-dimensional vector space. Then


(a) Every set of more than n elements of V is linearly dependent.
(b) No set of less than n elements spans V .
(c) A set of n elements forms a basis if and only if it spans V if and only if it is linearly
independent.
Example 2.32. The standard basis of the space P (n) of polynomials of degree n
is given by the n + 1 monomials 1, x, x2 , . . . , xn . (A formal proof of linear independence
appears in Exercise .) We conclude that the space P (n) has dimension n + 1. Thus, any
collection of > n + 1 polynomials of degree n is automatically linearly dependent. Any
other basis of P (n) must consist of n + 1 polynomials. However, not every collection of
n + 1 polynomials in P (n) is a basis they must be linearly independent. (See Exercise
for details.)
Remark : If a vector space V has no basis, it is either the trivial vector space V =
{0}, which by convention has dimension 0, or it has infinite dimension. Every infinitedimensional vector space contains an infinite collection of linearly independent vectors.
Examples of infinite-dimensional vector spaces include most spaces of functions, including
the spaces of continuous, differentiable, or mean zero functions, as well as the space of
all polynomials. On the other hand, the space of a solutions to a homogeneous linear
system of ordinary differential equations is a finite-dimensional vector space. The most
important example of an infinite-dimensional vector space, Hilbert space, to be discussed
in Chapter 11, is essential to modern analysis and function theory, as well as providing the
theoretical setting for all of quantum mechanics.
Lemma 2.33. The elements v1 , . . . , vn form a basis of V if and only if every x V
can be written uniquely as a linear combination thereof:
x = c 1 v1 + + c n vn =

n
X

ci vi

(2.21)

i=1

Proof : The condition that the basis span V implies every x V can be written as
some linear combination of the basis elements. Suppose we can write an element
x = c 1 v1 + + c n vn = b
c 1 v1 + + b
c n vn

as two different combinations. Subtracting one from the other, we find


(c1 b
c1 ) v1 + + (cn b
cn ) vn = 0.

Linear independence of the basis elements implies that the coefficients ci b


ci = 0. We
conclude that ci = b
ci , and hence the linear combinations are the same.
Q.E.D.
The coefficients (c1 , . . . , cn ) in (2.21) are called the coordinates of the vector x with
respect to the given basis. For the standard basis (2.18), the coordinates of a vector are its
entries i.e., its usual Cartesian coordinates; see (2.19). In many applications, an inspired
change of basis will lead to a better adapted coordinate systems, thereby simplifying the
computations.
3/7/03

70

c 2003

Peter J. Olver

Example 2.34. A Wavelet Basis. The vectors


1
1
1
1
1
1
v1 = ,
v2 =
v3 =
,
,
1
1
0
1
1
0

0
0
v4 =
,
1
1

(2.22)

form a basis of R 4 . This is verified by performing Gaussian elimination on the corresponding 4 4 matrix

1 1
1
0
1 1 1 0
A=
,
1 1 0
1
1 1 0 1
to check that A is nonsingular. This basis is a very simple example of a wavelet basis; the
general case will be discussed in Section 12.2. Wavelets arise in modern applications to
signal and digital image processing, [35].
How do we find the coordinates of a vector v relative to this basis? We need to find
the coefficients c1 , c2 , c3 , c4 such that
x = c 1 v1 + c 2 v2 + c 3 v3 + c 4 v4 .
In view of formula (2.15) we can rewrite this equation in matrix form
x = Ac

where

c = ( c 1 , c2 , c3 , c4 ) .
T

For example, solving the linear system for the vector x = ( 4, 2, 1, 5 ) produces the
unique solution c1 = 2, c2 = 1, c3 = 3, c4 = 2. These are the coordinates of x relative
to the wavelet basis:

0
1
1
1
4
0
1
1 1
2
x=
.
2
+ 3
= 2 v 1 v2 + 3 v 3 2 v 4 = 2
1
0
1
1
1
1
0
1
1
5

In general, to find the coordinates of a vector x with respect to a new basis of R n


requires the solution of a linear system of equations, namely
Ac = x

for

c = A1 x.

(2.23)

Here x = ( x1 , x2 , . . . , xn ) are the Cartesian coordinates of x, with respect to the standard


T
basis e1 , . . . , en , while c = ( c1 , c2 , . . . , cn ) denotes its coordinates with respect to the new
basis v1 , . . . , vn formed by the columns of the coefficient matrix A = [v1 , . . . , vn ].
Why would one want to change bases? The answer is simplification and speed
many computations and formulas become much easier, and hence faster, to perform in a
basis that is adapted to the problem at hand. In signal processing, the wavelet basis is
particularly appropriate for denoising, compression, and efficient storage of signals, such
as audio, still images, videos, medical imagery, geophysical images, and so on. These
processes would be quite time-consuming, if not impossible in the case of video processing,
to accomplish in the standard basis. Many other examples will appear throughout the
text.
3/7/03

71

c 2003

Peter J. Olver

2.5. The Fundamental Matrix Subspaces.


Let us now return to the general study of linear systems of equations, which we write
in our usual matrix form
A x = b.
(2.24)
Here A is an m n matrix, where m is the number of equations and n the number of
unknowns, i.e., the entries of x R n .
There are four important vector subspaces associated with any matrix, which play a
key role in the interpretation of our solution algorithm. The first two of these subspaces
are defined as follows.
Definition 2.35. The range of an an m n matrix A is the subspace rng A R m
spanned by the columns of A. The kernel or null space of A is the subspace ker A R n
consisting of all vectors which are annihilated by A, so
ker A = { z R n | A z = 0 } R n .

(2.25)

An alternative name for the range is the column space, since it is the span of the
columns of the matrix. A common alternative name for the kernel is the null space of the
matrix A. Both terms are used interchangeably in the literature.
Thus, the kernel of A is the set of solutions to the homogeneous system A z = 0 with
zero right hand side. The proof that ker A is a subspace requires us to verify the usual
closure conditions. If z, w ker A, then A z = 0 = A w. Therefore,
A(c z + d w) = c A z + d A w = 0
for any c, d R. This implies that c z + d w ker A, proving that ker A is a subspace.
This fact can be re-expressed as the following superposition principle for solutions to a
homogeneous system of linear equations.
Theorem 2.36. If z1 , . . . , zk are solutions to a homogeneous linear system A z = 0,
then so is any linear combination c1 z1 + + ck zk .
Warning: The set of solutions to an inhomogeneous linear system A x = b with b 6= 0
is not a subspace.

1 2 0
3
Example 2.37. Let us compute the kernel of the matrix A = 2 3 1 4 .
3 5 1 1
Since we are in essence solving the homogeneous system A x = 0, we only need perform
the elementary row operations
on A itself. The final result is the row echelon form U =

1 2 0
3
0 1 1 10 , which corresponds to the equations
0 0
0
0
x 2 y + 3w = 0,

3/7/03

72

y z 10 w = 0.

c 2003

Peter J. Olver

The free variables are z, w. The general solution to the homogeneous system is


17
2
2 z + 17 w
x
10
1
y z + 10 w
x= =
= z + w ,
0
1
z
z
1
0
w
w

which, for arbitrary scalars z, w, describes the most general vector in ker A. Thus, the
kernel of this matrix is the two-dimensional subspace of R 4 spanned by the linearly indeT
T
pendent vectors ( 2, 1, 1, 0 ) , ( 17, 10, 0, 1 ) .
Once we we know the kernel of the coefficient matrix A, the following basic theorem
gives a complete characterization of the solutions to the inhomogeneous linear system
(2.24).
Theorem 2.38. The linear system A x = b has a solution x? if and only if b lies in
the range of A. If this occurs, then the general solution to the linear system is
x = x? + z,

(2.26)

where z ker A is an arbitrary element of the kernel of A.


Proof : Let v1 , . . . , vn R m denote the columns of A. By our basic formula (2.15), if
x = (x1 , . . . , xn )T , then
A x = x1 v1 + + xn vn = b.
Therefore, b = A x if and only if b is a linear combination of the columns of A, which
proves the first part of the theorem. Secondly, if A x = b = A x? are any two solutions,
then their difference z = x x? satisfies
A z = A(x x? ) = A x A x? = b b = 0,
and hence z belongs to the kernel of A. Therefore, x and x? are related by formula (2.26),
which proves the second part of the theorem.
Q.E.D.
Therefore, to construct the most general solution to an inhomogeneous system, we
need only know one particular solution x? , along with the general solution z ker A to
the homogeneous equation. This construction should remind the reader of the method
of solution for inhomogeneous linear ordinary differential equations. Indeed, both linear
algebraic systems and linear ordinary differential equations are but two particular cases
of a general theory. Complete details will appear in Chapter 7. In particular, we can
characterize the case when the linear system has a unique solution in any of the following
equivalent ways.
Corollary 2.39. An m n matrix A has trivial kernel, ker A = {0}, if and only if
rank A = n if and only if there are no free variables if and only if the linear system A x = b
has a unique solution for each b rng A.
Specializing even further to square matrices, we can characterize its invertibility by
looking either at its kernel or at its range.
3/7/03

73

c 2003

Peter J. Olver

Proposition 2.40. A square matrix A is nonsingular if and only if it has trivial


kernel, ker A = {0} if and only if its range is the entire space rng A = R n .
Example 2.41. Consider the system A x = b, where


1 0 1
x1
A = 0 1 2 ,
x = x2 ,
x3
1 2 3

b1
b = b2 ,
b3

where the right hand side of the system will be left arbitrary. We shall perform our usual
Gaussian Elimination method, beginning with the augmented matrix

1 0 1 b1
0 1 2 b2 .

1 2 3 b3

Using the (1, 1) entry as our pivot, we first clear the entry beneath it; the resulting augmented matrix is

b1
1 0 1
0 1 2
b2 .

0 2 4 b3 b1

The second pivot is the (2, 2) entry, the resulting row echelon form is

1 0 1
b1
0 1 2
.
b2

0 0 0
b3 + 2 b 2 b 1

Since there are only two pivots, A has rank 2, and so its columns are linearly dependent.
The system has a solution if and only if the last reduced equation is compatible, which
requires that
b1 + 2 b2 + b3 = 0.
(2.27)
This compatibility condition is an equation that characterizes the range of the matrix A;
a vector b belongs to rng A if and only if its entries satisfy (2.27). Therefore, the range is
T
the plane in R 3 passing through the origin that is perpendicular to the vector ( 1, 2, 1 ) .
To characterize the kernel of A, we take b = 0, and solve the resulting homogeneous
system A z = 0. The reduced row echelon form is the same as above, and corresponds to
the reduced system
z1 z3 = 0,
z2 2 z3 = 0.
The free variable is z3 , and the equations are solved to give
z1 = c,

z2 = 2 c,

z3 = c,

where c is arbitrary. Thus, the general solution to the homogeneous system is z =


T
T
( c, 2 c, c ) = c ( 1, 2, 1 ) , and so the kernel is the line in the direction of the vector
T
( 1, 2, 1 ) .
3/7/03

74

c 2003

Peter J. Olver

If we take b = ( 3, 1, 1 ) which satisfies (2.27) and hence lies in the range of A


then the general solution to the inhomogeneous system A x = b is
x1 = 3 + c,

x2 = 1 + 2 c,

x3 = c,

where c is an arbitrary scalar. We write the solution in the form (2.26), so



1
3
3+c

x = 1 + 2 c = 1 + c 2 = x? + z,
1
0
c
T

where x? = ( 3, 1, 0 ) plays the role of the particular solution, and z = c ( 1, 2, 1 )


general element of the kernel.

is the

The Superposition Principle


The principle of superposition of solutions lies at the heart of linearity We shall explain
the superposition principle in the context of inhomogeneous linear algebraic systems. In
Chapter 7 we shall see that the general principle applies as stated to linear differential
equations, linear boundary value problems, linear integral equations, and all other linear
systems.
Suppose we have found particular solutions x?1 and x?2 to two inhomogeneous linear
systems
A x = b1 ,
A x = b2 ,
that have the same coefficient matrix A. Consider the system
A x = c 1 b1 + c 2 b2 ,
in which the right hand side is a linear combination or superposition of the previous two.
Then a particular solution to the combined system is given by the same linear combination
of the previous solutions:
x? = c1 x?1 + c2 x?2 .
The proof is easy; we use the rules of matrix arithmetic to compute
A x? = A(c1 x?1 + c2 x?2 ) = c1 A x?1 + c2 A x?2 = c1 b1 + c2 b2 .
In certain applications, the inhomogeneities b1 , b2 represent external forces, and the
solutions x?1 , x?2 represent the response of the physical apparatus to the force. The linear
superposition principle says that if we know how the system responds to the individual
forces, we immediately know its response to any combination thereof.
Example 2.42. For example, the system
$$ \begin{pmatrix} 4 & 1 \\ 1 & 4 \end{pmatrix} \begin{pmatrix} x \\ y \end{pmatrix} = \begin{pmatrix} f \\ g \end{pmatrix} $$
models the mechanical response of a pair of masses connected by springs to an external force. The solution x = ( x, y )^T represents the respective displacements of the masses, while the components of the right hand side f = ( f, g )^T represent the respective forces applied to each mass. (See Chapter 6 for full details.) Suppose we know the response x^⋆_1 = ( 4/15, -1/15 )^T of the system to a unit force f_1 = ( 1, 0 )^T on the first mass, and the response x^⋆_2 = ( -1/15, 4/15 )^T to a unit force f_2 = ( 0, 1 )^T on the second mass. We then know the response of the system to a general force, since we can write
$$ f = \begin{pmatrix} f \\ g \end{pmatrix} = f\, f_1 + g\, f_2 = f \begin{pmatrix} 1 \\ 0 \end{pmatrix} + g \begin{pmatrix} 0 \\ 1 \end{pmatrix}, $$
and hence the solution is
$$ x = f\, x^\star_1 + g\, x^\star_2 = f \begin{pmatrix} 4/15 \\ -1/15 \end{pmatrix} + g \begin{pmatrix} -1/15 \\ 4/15 \end{pmatrix} = \begin{pmatrix} (4\,f - g)/15 \\ (-\,f + 4\,g)/15 \end{pmatrix}. $$
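A quick numerical sanity check of this superposition, assuming NumPy and the 2 × 2 coefficient matrix displayed above (the particular force values f, g below are arbitrary illustrations):

```python
import numpy as np

K = np.array([[4., 1.],
              [1., 4.]])

x1 = np.linalg.solve(K, np.array([1., 0.]))   # response to a unit force on the first mass
x2 = np.linalg.solve(K, np.array([0., 1.]))   # response to a unit force on the second mass
print(x1, x2)                                 # [ 4/15 -1/15 ]  and  [ -1/15  4/15 ]

f, g = 2.0, -3.0                              # an arbitrary combined force
print(np.allclose(np.linalg.solve(K, [f, g]), f * x1 + g * x2))   # True: superposition
```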

The preceding construction is easily extended, and the result is a general Superposition
Principle for inhomogeneous linear systems.
Theorem 2.43. Suppose that we know particular solutions x^⋆_1, . . . , x^⋆_k to each of the inhomogeneous linear systems
$$ A x = b_1, \qquad A x = b_2, \qquad \ldots, \qquad A x = b_k. \tag{2.28} $$
Then, for any choice of scalars c_1, . . . , c_k, a particular solution to the combined system
$$ A x = c_1 b_1 + \cdots + c_k b_k \tag{2.29} $$
is the same superposition
$$ x^\star = c_1 x^\star_1 + \cdots + c_k x^\star_k $$
of the individual solutions. The general solution to (2.29) is
$$ u = x^\star + z = c_1 x^\star_1 + \cdots + c_k x^\star_k + z, $$
where z is the general solution to the homogeneous equation A z = 0.
If we know particular solutions x^⋆_1, . . . , x^⋆_m to
$$ A x = e_i, \qquad i = 1, \ldots, m, \tag{2.30} $$
where e_1, . . . , e_m are the standard basis vectors of R^m, (2.18), then we can reconstruct a particular solution x^⋆ to the general linear system A x = b by first writing
$$ b = b_1 e_1 + \cdots + b_m e_m $$
as a linear combination of the basis vectors, and then using superposition to form
$$ x^\star = b_1 x^\star_1 + \cdots + b_m x^\star_m. \tag{2.31} $$
However, for linear algebraic systems, the practical value of this insight is limited. Indeed, in the case when A is square and nonsingular, the superposition method is just a reformulation of the method of computing the inverse of the matrix. The vectors x^⋆_1, . . . , x^⋆_n which satisfy (2.30) are just the columns of A^{-1}, cf. (1.44), and the superposition formula (2.31) is, using (2.15), precisely the solution formula x^⋆ = A^{-1} b that we abandoned in practical computations, in favor of the more efficient Gaussian elimination method. Nevertheless, the implications of this result turn out to be of great importance in the study of linear boundary value problems.
Adjoint Systems, Cokernel, and Corange
A linear system of m equations in n unknowns has an m × n coefficient matrix A. The transposed matrix A^T will be of size n × m, and forms the coefficient matrix of an associated system consisting of n equations in m unknowns. There are important connections between the two linear systems, as we now discuss.
Definition 2.44. The adjoint to a linear system A x = b of m equations in n unknowns is the linear system
$$ A^T y = f $$
of n equations in m unknowns. Here y ∈ R^m and f ∈ R^n.
Example 2.45. Consider the linear system
$$ \begin{aligned} x_1 - 3\,x_2 - 7\,x_3 + 9\,x_4 &= b_1, \\ x_2 + 5\,x_3 - 3\,x_4 &= b_2, \end{aligned} \tag{2.32} $$
of two equations in four unknowns. Its coefficient matrix
$$ A = \begin{pmatrix} 1 & -3 & -7 & 9 \\ 0 & 1 & 5 & -3 \end{pmatrix} $$
is already in row echelon form. The free variables are x_3, x_4, and the general solution is
$$ x = \begin{pmatrix} b_1 + 3\,b_2 - 8\,x_3 \\ b_2 - 5\,x_3 + 3\,x_4 \\ x_3 \\ x_4 \end{pmatrix} = \begin{pmatrix} b_1 + 3\,b_2 \\ b_2 \\ 0 \\ 0 \end{pmatrix} + x_3 \begin{pmatrix} -8 \\ -5 \\ 1 \\ 0 \end{pmatrix} + x_4 \begin{pmatrix} 0 \\ 3 \\ 0 \\ 1 \end{pmatrix}. $$
In the second expression, the first vector is a particular solution and the remaining terms constitute the general element of the two-dimensional kernel of A.
The adjoint system to (2.32) is the following four equations in two unknowns:
$$ \begin{aligned} y_1 &= f_1, & -3\,y_1 + y_2 &= f_2, \\ -7\,y_1 + 5\,y_2 &= f_3, & 9\,y_1 - 3\,y_2 &= f_4, \end{aligned} $$
with transposed coefficient matrix
$$ A^T = \begin{pmatrix} 1 & 0 \\ -3 & 1 \\ -7 & 5 \\ 9 & -3 \end{pmatrix}. $$
The reduced row echelon form of the augmented matrix is
$$ \left( \begin{array}{cc|c} 1 & 0 & f_1 \\ 0 & 1 & 3\,f_1 + f_2 \\ 0 & 0 & -8\,f_1 - 5\,f_2 + f_3 \\ 0 & 0 & 3\,f_2 + f_4 \end{array} \right). $$
There are two compatibility conditions required for a solution:
$$ -8\,f_1 - 5\,f_2 + f_3 = 0, \qquad 3\,f_2 + f_4 = 0. $$
If satisfied, the adjoint system has a unique solution
$$ y_1 = f_1, \qquad y_2 = 3\,f_1 + f_2, $$
and the transpose of A has trivial kernel: ker A^T = {0}.
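The compatibility conditions and the claimed solution can be verified numerically; the following NumPy sketch assumes the coefficient matrix of (2.32) and an illustrative right hand side f chosen to satisfy both conditions:

```python
import numpy as np

A = np.array([[1., -3., -7., 9.],
              [0.,  1.,  5., -3.]])
AT = A.T

f = np.array([1., 2., 18., -6.])      # satisfies -8 f1 - 5 f2 + f3 = 0 and 3 f2 + f4 = 0
y = np.array([f[0], 3*f[0] + f[1]])   # the claimed unique solution y1 = f1, y2 = 3 f1 + f2
print(np.allclose(AT @ y, f))         # True

print(np.linalg.matrix_rank(AT))      # 2 = number of columns of A^T, so ker A^T = {0}
```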


On the surface, there appears to be little connection between a linear system and its adjoint. Nevertheless, as we shall soon see (and then in even greater depth in Section 5.6), there are remarkable, but subtle, interrelations between the two. These turn out to have significant consequences, not only for linear algebraic systems, but also for their even more profound extensions to differential equations.
The adjoint system allows us to define the other two fundamental subspaces associated
with the coefficient matrix A.
Definition 2.46. The corange of an m × n matrix A is the range of its transpose,
$$ \operatorname{corng} A = \operatorname{rng} A^T \subseteq R^n. $$
The corange coincides with the subspace of R^n spanned by the rows of A, and is sometimes referred to as the row space. The cokernel or left null space of A is the kernel of its transpose,
$$ \operatorname{coker} A = \ker A^T = \{\, w \in R^m \mid A^T w = 0 \,\} \subseteq R^m. \tag{2.33} $$
The Fundamental Theorem of Linear Algebra

The four fundamental subspaces associated with the matrix A, then, are its range,
corange, kernel and cokernel. The Fundamental Theorem of Linear Algebra states that
their dimensions are entirely prescribed by the rank (and size) of the matrix.
Theorem 2.47. Let A be an m × n matrix of rank r. Then
$$ \dim \operatorname{corng} A = \dim \operatorname{rng} A = \operatorname{rank} A = \operatorname{rank} A^T = r, \qquad \dim \ker A = n - r, \qquad \dim \operatorname{coker} A = m - r. \tag{2.34} $$

Remark : Thus, the rank of a matrix indicates the number of linearly independent
columns, which, remarkably, is always the same as the number of linearly independent
rows! A matrix and its transpose have the same rank, i.e., the same number of pivots,
even though their row echelon forms are quite different, and not usually transposes of each
other. Theorem 2.47 also proves our earlier contention that the rank of a matrix is an
intrinsic quantity, and does not depend on which specific elementary row operations are
used, nor on the final row echelon form.
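As an illustration of (2.34), one can let the computer determine the ranks; the sketch below (assuming NumPy) uses the 3 × 3 matrix from Example 2.41:

```python
import numpy as np

A = np.array([[1., 0., -1.],
              [0., 1., -2.],
              [1., -2., 3.]])
m, n = A.shape
r = np.linalg.matrix_rank(A)

print(r == np.linalg.matrix_rank(A.T))   # True: rank A = rank A^T
print(n - r, m - r)                      # 1 1 : dim ker A and dim coker A
```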

Not to be confused with the Fundamental Theorem of Algebra, Theorem 15.62, that states
that a polynomial has a complex root.

Proof: We show how to construct bases for each of the subspaces in a particular instance, and thereby illustrate the method of proof of the Theorem. Consider the matrix
$$ A = \begin{pmatrix} 2 & -1 & 1 & 2 \\ 8 & -4 & 6 & 4 \\ 4 & -2 & 3 & 2 \end{pmatrix}. $$
The row echelon form of A is obtained in the usual manner:
$$ U = \begin{pmatrix} 2 & -1 & 1 & 2 \\ 0 & 0 & 2 & -4 \\ 0 & 0 & 0 & 0 \end{pmatrix}. $$
Thus the rank of A is r = 2.


Kernel: We need to find the solutions to the homogeneous system A x = 0. The general solution is expressed in terms of the n − r free variables, which are the ones whose columns do not contain a pivot. If these are x_{i_1}, . . . , x_{i_{n-r}}, then we can expand the formula for the general solution as a linear combination
$$ x = x_{i_1} z_1 + x_{i_2} z_2 + \cdots + x_{i_{n-r}} z_{n-r} \tag{2.35} $$
of certain vectors z_1, . . . , z_{n-r}. The claim is that these vectors are linearly independent, and so form a basis for ker A. Indeed, the i_j th entry of x in (2.35) is exactly the free variable x_{i_j}. Therefore, the linear combination (2.35) is trivial, x = 0, if and only if each x_{i_j} = 0, which immediately proves linear independence of z_1, . . . , z_{n-r}. Since the basis contains n − r vectors, this implies the dimension formula for ker A.
In our example, the pivots are in columns 1 and 3, and so the free variables are x_2, x_4. Using back substitution on the reduced homogeneous system U x = 0, we find the general solution
$$ x = \begin{pmatrix} \tfrac12 x_2 - 2\,x_4 \\ x_2 \\ 2\,x_4 \\ x_4 \end{pmatrix} = x_2 \begin{pmatrix} \tfrac12 \\ 1 \\ 0 \\ 0 \end{pmatrix} + x_4 \begin{pmatrix} -2 \\ 0 \\ 2 \\ 1 \end{pmatrix}. \tag{2.36} $$
Note that the second and fourth entries are the corresponding free variables x_2, x_4. Therefore,
$$ z_1 = \bigl( \tfrac12,\ 1,\ 0,\ 0 \bigr)^T, \qquad z_2 = ( -2,\ 0,\ 2,\ 1 )^T, $$
are the basis vectors for ker A. By construction, they span the kernel, and linear independence follows easily since the only way in which the linear combination (2.36) could vanish, x = 0, is if both free variables vanish: x_2 = x_4 = 0.
Corange: The corange is the subspace of R^n spanned by the rows of A. Applying an elementary row operation does not alter the corange. To see this for row operations of the first type, suppose that \widehat A is obtained by adding a times the first row of A to the second row. If r_1, r_2, . . . , r_m are the rows of A, then the rows of \widehat A are r_1, \widehat r_2 = r_2 + a\,r_1, r_3, . . . , r_m. If
$$ v = c_1 r_1 + c_2 r_2 + c_3 r_3 + \cdots + c_m r_m $$
is any vector in corng A, then
$$ v = \widehat c_1 r_1 + c_2 \widehat r_2 + c_3 r_3 + \cdots + c_m r_m, \qquad \text{where} \qquad \widehat c_1 = c_1 - a\,c_2, $$
is also a linear combination of the rows of the new matrix, and hence lies in corng \widehat A. The converse is also valid, and we conclude that elementary row operations of Type #1 do not change corng A. The proof for the other two types of elementary row operations is even easier, and left to the reader.
Since the row echelon form U is obtained from A by a sequence of elementary row operations, we conclude that corng A = corng U. Moreover, because each nonzero row in U contains a pivot, it is not hard to see that the nonzero rows of U are linearly independent, and hence form a basis of both corng U and corng A. Since there is one row per pivot, corng U = corng A has dimension r. In our example, then, a basis for corng A consists of the row vectors
$$ s_1 = ( 2,\ -1,\ 1,\ 2 ), \qquad s_2 = ( 0,\ 0,\ 2,\ -4 ). $$

The reader should verify their linear independence, and the fact that every row of A lies
in their span.
Range: There are two methods for computing a basis of the range or column space.
The first proves that it has dimension equal to the rank. This has the important, and
remarkable consequence that the space spanned by the rows of a matrix and the space
spanned by its columns always have the same dimension, even though they are, in general,
subspaces of different vector spaces.
Now the range of A and the range of U are, in general, different subspaces, so we
cannot directly use a basis for rng U to give a basis for rng A. However, the linear
dependencies among the columns of A and U are the same. It is not hard to see that the
columns of U that contain the pivots form a basis for rng U . Therefore, the same columns
of A form a basis for rng A. In particular, this implies that dim rng A = dim rng U = r.
In more detail, using our formula (2.15), we see that a linear combination of the columns of A is trivial,
$$ c_1 v_1 + \cdots + c_n v_n = A\,c = 0, $$
if and only if c ∈ ker A. But we know ker A = ker U, and so the same linear combination of the columns of U, namely
$$ U c = c_1 u_1 + \cdots + c_n u_n = 0, $$
is also trivial. In particular, the linear independence of the pivot columns of U, labeled u_{j_1}, . . . , u_{j_r}, implies the linear independence of the same collection, v_{j_1}, . . . , v_{j_r}, of columns of A. Moreover, the fact that any other column of U can be written as a linear combination
$$ u_k = d_1 u_{j_1} + \cdots + d_r u_{j_r} $$
of the pivot columns implies that the same holds for the corresponding column of A, so
$$ v_k = d_1 v_{j_1} + \cdots + d_r v_{j_r}. $$
In our example, the pivots lie in the first and third columns of U, and hence the first and third columns of A, namely
$$ v_1 = \begin{pmatrix} 2 \\ 8 \\ 4 \end{pmatrix}, \qquad v_3 = \begin{pmatrix} 1 \\ 6 \\ 3 \end{pmatrix}, $$
form a basis for rng A. Thus, every column of A can be written as a linear combination of the first and third columns.
An alternative method to find a basis for the range is to note that rng A = corng A^T. Thus, we can employ our previous algorithm to compute corng A^T. In the particular case, applying Gaussian elimination to
$$ A^T = \begin{pmatrix} 2 & 8 & 4 \\ -1 & -4 & -2 \\ 1 & 6 & 3 \\ 2 & 4 & 2 \end{pmatrix} $$
leads to the row echelon form
$$ \widehat U = \begin{pmatrix} 2 & 8 & 4 \\ 0 & 2 & 1 \\ 0 & 0 & 0 \\ 0 & 0 & 0 \end{pmatrix}. \tag{2.37} $$
Observe that the row echelon form of A^T is not the transpose of the row echelon form of A! However, they do have the same number of pivots, since both A and A^T have the same rank. Since the pivots of \widehat U lie in its first two rows, we conclude that
$$ y_1 = \begin{pmatrix} 2 \\ 8 \\ 4 \end{pmatrix}, \qquad y_2 = \begin{pmatrix} 0 \\ 2 \\ 1 \end{pmatrix}, $$
form an alternative basis for rng A.
Cokernel: Finally, to determine a basis for the cokernel of the matrix, we apply the preceding algorithm for finding a basis for the kernel to A^T, using the identification coker A = ker A^T. Since the ranks of A and A^T coincide, there are now m − r free variables, which is the same as the dimension of ker A^T.
In our particular example, using the reduced form (2.37) of A^T, the only free variable is y_3, and the general solution is
$$ y = \begin{pmatrix} 0 \\ -\tfrac12\,y_3 \\ y_3 \end{pmatrix} = y_3 \begin{pmatrix} 0 \\ -\tfrac12 \\ 1 \end{pmatrix}. $$
We conclude that coker A is one-dimensional, with basis \bigl( 0,\ -\tfrac12,\ 1 \bigr)^T.
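Readers with SymPy available can reproduce all four bases symbolically; the following sketch is one possible way to do so for the matrix used in this proof:

```python
from sympy import Matrix

A = Matrix([[2, -1, 1, 2],
            [8, -4, 6, 4],
            [4, -2, 3, 2]])

print(A.nullspace())     # basis of ker A:  [ (1/2, 1, 0, 0)^T, (-2, 0, 2, 1)^T ]
print(A.rowspace())      # basis of corng A (nonzero rows of an echelon form)
print(A.columnspace())   # basis of rng A: the pivot columns (2, 8, 4)^T and (1, 6, 3)^T
print(A.T.nullspace())   # basis of coker A: [ (0, -1/2, 1)^T ]
```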


Figure 2.2. Three Different Graphs.
Figure 2.3. Three Versions of the Same Graph.
2.6. Graphs and Incidence Matrices.


We now present an application of linear systems to graph theory. A graph consists
of one or more points, called vertices, and lines or curves connecting them, called edges.
Both ends of an edge are vertices. For simplicity, we will always assume the ends are
distinct, so that no edge connects a vertex to itself. However, we do permit two vertices
to be connected by multiple edges. Some examples of graphs appear in Figure 2.2. The
vertices are the black dots. In a planar representation of the graph, the edges may cross over each other at non-nodal points, but do not actually meet; think of a circuit where the (insulated) wires lie on top of each other, but do not touch. Thus, the first graph in Figure 2.2 has 4 vertices and 6 edges (the two central edges do not meet); the second has 5 vertices and 8 edges; the final graph has 5 vertices and 10 edges.
Two graphs are considered to be the same if one can identify all their edges and
vertices, so that they have the same connectivity properties. A good way to visualize this
is to think of a graph as a collection of wires connected at the vertices. Moving the wires
around without cutting or rejoining them will have no effect on the underlying graph.
Consequently, there are many ways to draw a given graph; see Figure 2.3.
Graphs arise in a multitude of applications. A particular case that will be considered in
depth is electrical networks, where the edges represent wires, and the vertices represent the
nodes where the wires are connected. Another example is the framework for a building
Figure 2.4. A Simple Graph.

the edges represent the beams and the vertices the joints where the beams are connected. In each case, the graph encodes the topology, meaning the interconnectedness of the system, but not its geometry: lengths of edges, angles, etc.
Two vertices in a graph are adjacent if there is an edge connecting them. Two edges
are adjacent if they meet at a common vertex. For instance, in Figure 2.4, all vertices are
adjacent; edge 1 is adjacent to all edges except edge 5. A path in a graph is a sequence
of distinct (non-repeated) edges, with each edge adjacent to the next one in the sequence.
For example, in Figure 2.4, one path starts at vertex #1, then goes in order along the
edges labeled as 1, 4, 3, 2, thereby passing through vertices 1, 2, 4, 1, 3. Note that while an
edge cannot be repeated in a path, a vertex may be. A circuit is a path that ends up
where it began. For example, the circuit consisting of edges 1, 4, 5, 2 starts at vertex 1,
then goes to vertices 2, 4, 3 in order, and finally returns to vertex 1. The starting vertex
for a circuit is not important. For example, edges 4, 5, 2, 1 also represent the same circuit
we just described. A graph is called connected if you can get from any vertex to any other
vertex by a path. This is the most important case for applications. Every graph can be
decomposed into one or more connected subgraphs.
In electrical circuits, one is interested in measuring currents and voltage drops along
the wires in the network represented by the graph. Both of these quantities have a direction,
and therefore we need to specify an orientation on each edge in order to quantify how the
current moves along the wire. The orientation will be fixed by specifying the vertex the
edge starts at, and the vertex it ends at. Once we assign a direction to an edge, a
current along that wire will be positive if it moves in the same direction, i.e., goes from
the starting vertex to the ending one, and negative if it moves in the opposite direction.
The direction of the edge does not dictate the direction of the current; it just fixes what directions positive and negative values of current represent. A graph with directed edges
is known as a directed graph or digraph for short. The edge directions are represented by
arrows; examples of digraphs can be seen in Figure 2.5.
Consider a digraph D consisting of n vertices connected by m edges. The incidence matrix A associated with D is the m × n matrix whose rows are indexed by the edges in D and whose columns are indexed by the vertices. If edge k starts at vertex i and terminates
Figure 2.5. Digraphs.
Figure 2.6. A Simple Digraph.
at vertex j, then row k of the incidence matrix will have a +1 in its (k, i) entry and a −1 in its (k, j) entry; all other entries of the row are zero. Thus, the +1 entries represent starting vertices and the −1 entries the ending vertices. (Note: a vertex can be starting for some edges and ending for others.)
A simple example is the digraph in Figure 2.6, which consists of five edges joined at four
different vertices, labeled as in the illustration. The 5 × 4 incidence matrix is
$$ A = \begin{pmatrix} 1 & -1 & 0 & 0 \\ 1 & 0 & -1 & 0 \\ 1 & 0 & 0 & -1 \\ 0 & 1 & 0 & -1 \\ 0 & 0 & 1 & -1 \end{pmatrix}. \tag{2.38} $$
Thus the first row of A tells us that the first edge starts at vertex 1 and ends at vertex 2.
Similarly, row 2 says that the second edge goes from vertex 1 to vertex 3. Clearly one can
completely reconstruct any digraph from its incidence matrix.
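One possible way to assemble an incidence matrix from an edge list in code is sketched below (assuming NumPy; the helper name incidence_matrix and the 1-based vertex labels are illustrative choices, and the edge list corresponds to the digraph of Figure 2.6 as read off from (2.38)):

```python
import numpy as np

def incidence_matrix(edges, n_vertices):
    """Rows indexed by edges, columns by vertices: +1 at the start vertex, -1 at the end vertex."""
    A = np.zeros((len(edges), n_vertices), dtype=int)
    for k, (start, end) in enumerate(edges):
        A[k, start - 1] = 1      # vertices are labeled 1, 2, ...
        A[k, end - 1] = -1
    return A

edges = [(1, 2), (1, 3), (1, 4), (2, 4), (3, 4)]   # the five directed edges of Figure 2.6
print(incidence_matrix(edges, 4))                  # reproduces (2.38)
```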
Figure 2.7. Another Digraph.
Example 2.48. The matrix
$$ A = \begin{pmatrix} 1 & -1 & 0 & 0 & 0 \\ -1 & 0 & 1 & 0 & 0 \\ 0 & -1 & 1 & 0 & 0 \\ 0 & 1 & 0 & -1 & 0 \\ 0 & 0 & 1 & -1 & 0 \\ 0 & 0 & 0 & 1 & -1 \\ 0 & 0 & 1 & 0 & -1 \end{pmatrix} $$
qualifies as an incidence matrix because each row contains a single +1, a single −1, and the other entries are 0. Let us construct the digraph corresponding to A. Since A has five columns, there are five vertices in the digraph, which we label by the numbers 1, 2, 3, 4, 5. Since it has seven rows, there are 7 edges. The first row has its +1 in column 1 and its −1 in column 2, and so the first edge goes from vertex 1 to vertex 2. Similarly, the second
edge corresponds to the second row of A and so goes from vertex 3 to vertex 1. The third
row of A gives an edge from vertex 3 to vertex 2; and so on. In this manner we construct
the digraph drawn in Figure 2.7.
The incidence matrix has important algebraic consequences for the graph it represents. In particular, its kernel and cokernel have topological significance. For example, the kernel of the incidence matrix (2.38) is spanned by the single vector
$$ z = ( 1,\ 1,\ 1,\ 1 )^T, $$
and represents the fact that the sum of the entries in any given row of A is zero; see Exercise . This observation is a general fact for connected digraphs.
Proposition 2.49. If A is the incidence matrix for a connected digraph D, then ker A is one-dimensional, with basis z = ( 1, 1, . . . , 1 )^T.
Proof : If edge k connects vertices i and j, then the k th equation in A z = 0 is zi = zj .


The same equality holds, by a simple induction, if the vertices i and j are connected
by a path. Therefore, if D is connected, all the entries of z are equal, and the result
follows.
Q.E.D.
Remark : In general, dim ker A equals the number of connected components in the
digraph D. See Exercise .
Corollary 2.50. If A is the incidence matrix for a connected digraph D with n vertices, then rank A = n − 1.
Proof: This is an immediate consequence of Theorem 2.47 and Proposition 2.49. Q.E.D.

Next, let us look at the cokernel of an incidence matrix. Consider the particular example (2.38). We need to compute the kernel of the transposed incidence matrix
$$ A^T = \begin{pmatrix} 1 & 1 & 1 & 0 & 0 \\ -1 & 0 & 0 & 1 & 0 \\ 0 & -1 & 0 & 0 & 1 \\ 0 & 0 & -1 & -1 & -1 \end{pmatrix}. \tag{2.39} $$

Solving the homogeneous system A^T y = 0 by Gaussian elimination, we discover that coker A = ker A^T is spanned by the two vectors
$$ y_1 = ( 1,\ 0,\ -1,\ 1,\ 0 )^T, \qquad y_2 = ( 0,\ 1,\ -1,\ 0,\ 1 )^T. $$

Each of these vectors represents a circuit in the digraph, the nonzero entries representing
the edges and the direction in which they are traversed. For example, y 1 corresponds to
the circuit that starts out along edge #1, then traverses edge #3 in the reverse direction
(which is indicated by the minus sign), and then goes along edge #4 in the proper direction.
Similarly, y2 represents the circuit consisting of edge #2, followed by edge #3, backwards,
and then edge #5. The fact that y1 and y2 are linearly independent vectors says that the
two circuits are independent. The general element of coker A is a linear combination
c_1 y_1 + c_2 y_2. Certain values of the constants lead to other types of circuits; for example, −y_1 represents the same circuit as y_1, but traversed in the opposite direction. Another example is
$$ y_1 - y_2 = ( 1,\ -1,\ 0,\ 1,\ -1 )^T, $$
which represents the square circuit going around the outside of the digraph, along edges 1, 4, 5, 2, the latter two being in the reverse direction. We can view this circuit as a combination of the two triangular circuits; when we add them together the middle edge #3 is traversed once in each direction, which effectively cancels its contribution. (A similar cancellation occurs in the theory of line integrals; see Section A.5.) Other combinations represent virtual circuits; for instance, one can interpret 2 y_1 − ½ y_2 as two times around the first triangular circuit plus one half of the other triangular circuit, in the opposite
direction. Whatever that means . . .
Let us summarize the preceding discussion.
Figure 2.8. A Cubical Digraph.

Theorem 2.51. Each circuit in a digraph D is represented by a vector in the cokernel of its incidence matrix, whose entries are +1 if the edge is traversed in the correct direction, −1 if in the opposite direction, and 0 if the edge is not in the circuit. The dimension of the cokernel of A equals the number of independent circuits in D.
These two results have an important and remarkable consequence. Suppose D is a connected digraph with m edges and n vertices and A its m × n incidence matrix. Corollary 2.50 implies that A has rank r = n − 1 = n − dim ker A. On the other hand, Theorem 2.51 says that dim coker A = l equals the number of independent circuits in D. The Fundamental Theorem 2.47 says that r = m − l. Equating these two different computations of the rank, we find r = n − 1 = m − l, or n + l = m + 1. This celebrated result is known as Euler's formula for graphs, first discovered by the extraordinarily prolific eighteenth century Swiss mathematician Leonhard Euler (pronounced "Oiler").
Theorem 2.52. If G is a connected graph, then
$$ \#\text{ vertices} \;+\; \#\text{ independent circuits} \;=\; \#\text{ edges} \;+\; 1. \tag{2.40} $$
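Euler's formula can also be confirmed by rank computations; the following NumPy sketch uses the incidence matrix (2.38) of the connected digraph in Figure 2.6:

```python
import numpy as np

A = np.array([[ 1, -1,  0,  0],
              [ 1,  0, -1,  0],
              [ 1,  0,  0, -1],
              [ 0,  1,  0, -1],
              [ 0,  0,  1, -1]])
m, n = A.shape                   # m = 5 edges, n = 4 vertices
r = np.linalg.matrix_rank(A)     # n - 1 = 3, since the digraph is connected
l = m - r                        # number of independent circuits = dim coker A
print(n + l == m + 1)            # True: Euler's formula (2.40)
```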

Remark : If the graph is planar , meaning that it can be graphed in the plane without
any edges crossing over each other, then the number of independent circuits is equal to
the number of holes in the graph, i.e., the number of distinct polygonal regions bounded
by the edges of the graph. For example, the pentagonal digraph in Figure 2.7 bounds
three triangles, and so has three independent circuits. For non-planar graphs, (2.40) gives
a possible definition of the number of independent circuits, but one that is not entirely
standard, [25]. A more detailed discussion relies on further developments in the topological
properties of graphs.
Example 2.53. Consider the graph corresponding to the edges of a cube, as illustrated in Figure 2.8, where the second figure represents the same graph squashed down onto a plane. The graph has 8 vertices and 12 edges. Therefore, Euler's formula (2.40) tells us that there are 5 independent circuits. These correspond to the interior square and four trapezoids in the planar version of the digraph, and hence to circuits
around 5 of the 6 faces of the cube. The missing face does indeed define a circuit, but
it can be represented as the sum of the other five circuits, and so is not independent. In
Exercise , the reader is asked to write out the incidence matrix for the cubical digraph
and explicitly identify the basis of its kernel with the circuits.
We do not have the space to develop the connections between graph theory and linear
algebra in more detail. The interested reader can find details in texts on graph theory,
e.g., [25].

Chapter 3
Inner Products and Norms
The geometry of Euclidean space is founded upon the basic notions of length and
angle. The abstract concept of a norm on a vector space formalizes the geometrical notion of the length of a vector. In Euclidean geometry, the angle between two vectors is
determined by their dot product, which is itself formalized by the abstract concept of an
inner product. Inner products and norms lie at the heart of mathematical analysis, in
both finite-dimensional vector spaces and infinite-dimensional function spaces, and it is impossible to overemphasize their importance for theoretical developments, practical applications, and the design of numerical solution algorithms. In essence,
all areas of applied mathematics require the introduction of a suitable norm and/or inner
product on the underlying vector space.
Mathematical analysis relies on certain basic inequalities. For any inner product
space, the most basic is the CauchySchwarz inequality, which has a relatively simple
abstract proof that highlights its essential features and avoids the seeming complications
that ensue from the explicit formulas. The triangle inequality for the associated norm is
a simple consequence; for more general norms, this inequality forms part of the definition.
These inequalities are equally valid in both finite-dimensional and infinite-dimensional
vector spaces.
In Euclidean space R n , the characterization of general inner products will lead us
to the extremely important class of positive definite matrices. Positive definite matrices
appear in a variety of applications, including minimization problems, least squares, mechanics, electrical circuits, and the differential equations describing dynamical processes. Later in this book, we will generalize the notion of positive definiteness to differential operators, and see how these underlie the partial differential equations of continuum mechanics
and dynamics.
In the final section, we formally introduce complex numbers and complex vector
spaces. Most of the construction is in direct analogy with the real version, but the notion
of an inner product and consequent norm on a complex vector space necessitates some
additional thought. Applications, particularly in Fourier analysis, will be forthcoming in
later chapters.

3.1. Inner Products.


For most of this chapter until the final section V will denote a real vector space.
The definition of an inner product is modeled on the familiar dot product between two
vectors in R n ; see (3.2) below.
Definition 3.1. An inner product on the real vector space V is a pairing that takes two vectors v, w ∈ V and produces a real number ⟨ v ; w ⟩ ∈ R. The inner product is required to satisfy the following axioms for all u, v, w ∈ V, and c, d ∈ R.
(i) Bilinearity:
$$ \langle c\,u + d\,v \,;\, w \rangle = c\,\langle u \,;\, w \rangle + d\,\langle v \,;\, w \rangle, \qquad \langle u \,;\, c\,v + d\,w \rangle = c\,\langle u \,;\, v \rangle + d\,\langle u \,;\, w \rangle. $$
(ii) Symmetry:
$$ \langle v \,;\, w \rangle = \langle w \,;\, v \rangle. $$
(iii) Positivity:
$$ \langle v \,;\, v \rangle > 0 \quad \text{whenever} \quad v \neq 0, \qquad \text{while} \qquad \langle 0 \,;\, 0 \rangle = 0. $$

A vector space equipped with an inner product is called an inner product space. As we shall see, a given vector space can admit many different inner products.
Given an inner product, the associated norm of a vector v ∈ V is defined as the positive square root of the inner product of the vector with itself:
$$ \| v \| = \sqrt{ \langle v \,;\, v \rangle }. \tag{3.1} $$
The positivity axiom implies that ‖ v ‖ ≥ 0 is real and non-negative, and equals 0 if and only if v = 0 is the zero vector.

Example 3.2. The simplest example of an inner product is the standard Euclidean dot product
$$ \langle v \,;\, w \rangle = v \cdot w = v_1 w_1 + \cdots + v_n w_n = \sum_{i=1}^n v_i\,w_i, \tag{3.2} $$
between (column) vectors v = ( v_1, v_2, . . . , v_n )^T, w = ( w_1, w_2, . . . , w_n )^T in R^n. The bilinearity and symmetry properties are immediate. Positivity is also easy, since
$$ \langle v \,;\, v \rangle = \sum_{i=1}^n v_i^2 \;\geq\; 0 $$
is a sum of squares, and hence always positive, unless, of course, v = 0 is the zero vector. Therefore, the Euclidean norm is found by taking the square root of the sum of the squares of the vector entries:
$$ \| v \| = \sqrt{ \sum_{i=1}^n v_i^2 }, \tag{3.3} $$
and serves to measure the standard, Euclidean length of vectors in R^n. This formula generalizes the classical Pythagorean Theorem to n-dimensional Euclidean space; see Figure 3.1.
An important observation is that one can identify the dot product (3.2) with the matrix product
$$ v \cdot w = v^T w = ( v_1 \ \ v_2 \ \ \ldots \ \ v_n ) \begin{pmatrix} w_1 \\ w_2 \\ \vdots \\ w_n \end{pmatrix} \tag{3.4} $$
Figure 3.1. The Euclidean Norm in R^2 and R^3.

of a row vector vT and a column vector w.


Example 3.3. While certainly the most important inner product on R^n, the dot product is by no means the only possibility. For example, the standard dot product on V = R^2 is simply
$$ v \cdot w = v_1 w_1 + v_2 w_2, \qquad v = \begin{pmatrix} v_1 \\ v_2 \end{pmatrix}, \qquad w = \begin{pmatrix} w_1 \\ w_2 \end{pmatrix}. \tag{3.5} $$
Consider the alternative expression
$$ \langle v \,;\, w \rangle = v_1 w_1 - v_1 w_2 - v_2 w_1 + 4\, v_2 w_2. \tag{3.6} $$
It is not hard to see that this pairing satisfies the bilinearity axiom. The symmetry axiom is immediate. Finally, positivity is ensured by noticing that
$$ \langle v \,;\, v \rangle = v_1^2 - 2\,v_1 v_2 + 4\,v_2^2 = (v_1 - v_2)^2 + 3\,v_2^2 > 0 $$
is strictly positive for all nonzero v ≠ 0. Therefore, (3.6) defines an alternative inner product on R^2. The associated norm
$$ \| v \| = \sqrt{ v_1^2 - 2\,v_1 v_2 + 4\,v_2^2 } $$
defines a different notion of distance and consequential non-Pythagorean plane geometry.

Example 3.4. Let d_1, . . . , d_n be a set of positive numbers. The corresponding weighted inner product and weighted norm on R^n are defined by
$$ \langle v \,;\, w \rangle = \sum_{i=1}^n d_i\, v_i\, w_i, \qquad \| v \| = \sqrt{ \sum_{i=1}^n d_i\, v_i^2 }. \tag{3.7} $$
The numbers d_i > 0 are the weights. The larger the weight d_i, the more the i-th coordinate of v contributes to the norm. Weighted norms are particularly important in statistics and data fitting, where one wants to emphasize certain quantities and de-emphasize others; this is done by assigning suitable weights to the different components of the data vector v. See Section 4.4 on least squares approximation methods for more details.
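In code, a weighted inner product is a one-line computation; the sketch below (assuming NumPy, with purely illustrative weights and vectors) implements (3.7):

```python
import numpy as np

def weighted_inner(v, w, d):
    """Weighted inner product (3.7) with positive weights d."""
    return np.sum(d * v * w)

d = np.array([1., 2., 3.])               # hypothetical weights
v = np.array([1., -1., 2.])
w = np.array([0., 4., 1.])
print(weighted_inner(v, w, d))           # 1*1*0 + 2*(-1)*4 + 3*2*1 = -2
print(np.sqrt(weighted_inner(v, v, d)))  # the weighted norm of v, sqrt(15)
```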
Inner Products on Function Space
Inner products and norms on function spaces will play an essential role in the development of Fourier analysis and the solution to boundary value problems for both ordinary and partial differential equations. Let us introduce the most important inner products on function space.
Example 3.5. Given a bounded closed interval [ a, b ] ⊂ R, consider the vector space C⁰ = C⁰[ a, b ] consisting of all continuous functions f : [ a, b ] → R. The integral
$$ \langle f \,;\, g \rangle = \int_a^b f(x)\, g(x)\, dx \tag{3.8} $$
defines an inner product on the vector space C⁰, as we shall prove below. The associated norm is, according to the basic definition (3.1),
$$ \| f \| = \sqrt{ \int_a^b f(x)^2\, dx }. \tag{3.9} $$
This quantity is known as the L² norm of the function f over the interval [ a, b ]. The L² norm plays the same role in infinite-dimensional function space that the Euclidean norm or length of a vector plays in the finite-dimensional Euclidean vector space R^n.
For example, if we take [ a, b ] = [ 0, ½π ], then the inner product between f(x) = sin x and g(x) = cos x is equal to
$$ \langle \sin x \,;\, \cos x \rangle = \int_0^{\pi/2} \sin x \cos x \, dx = \tfrac12 \sin^2 x \,\Big|_{x=0}^{\pi/2} = \tfrac12. $$
Similarly, the norm of the function sin x is
$$ \| \sin x \| = \sqrt{ \int_0^{\pi/2} (\sin x)^2\, dx } = \sqrt{ \frac{\pi}{4} }. $$
One must always be careful when evaluating norms; for example, the constant function c(x) ≡ 1 has norm
$$ \| 1 \| = \sqrt{ \int_0^{\pi/2} 1^2\, dx } = \sqrt{ \frac{\pi}{2} }, $$
not 1 as the reader might have expected. We also note that the value of the norm depends upon which interval the integral is taken over! For instance, on the longer interval [ 0, π ],
$$ \| 1 \| = \sqrt{ \int_0^{\pi} 1^2\, dx } = \sqrt{\pi}. $$
Thus, when dealing with the L² inner product or norm, one must always be careful to specify the function space, or, equivalently, the interval on which it is being evaluated.
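These integrals are easily reproduced with a computer algebra system; the following sketch assumes SymPy and evaluates the L² inner product and norms above on [0, π/2]:

```python
from sympy import symbols, sin, cos, integrate, sqrt, pi

x = symbols('x')
a, b = 0, pi / 2

inner = lambda f, g: integrate(f * g, (x, a, b))   # the L2 inner product (3.8)
norm = lambda f: sqrt(inner(f, f))                 # the associated L2 norm (3.9)

print(inner(sin(x), cos(x)))   # 1/2
print(norm(sin(x)))            # sqrt(pi)/2, i.e. sqrt(pi/4)
print(norm(1))                 # sqrt(2)*sqrt(pi)/2, i.e. sqrt(pi/2)
```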
Let us prove that formula (3.8) does, indeed, define an inner product. First, we need to check that ⟨ f ; g ⟩ is well-defined; this follows because the product f(x) g(x) of two continuous functions is also continuous, and hence its integral over a bounded interval is defined and finite. The symmetry condition for the inner product is immediate:
$$ \langle f \,;\, g \rangle = \int_a^b f(x)\, g(x)\, dx = \langle g \,;\, f \rangle, $$
because multiplication of functions is commutative. The first bilinearity axiom,
$$ \langle c\,f + d\,g \,;\, h \rangle = c\,\langle f \,;\, h \rangle + d\,\langle g \,;\, h \rangle, $$
amounts to the following elementary integral identity
$$ \int_a^b \bigl( c\,f(x) + d\,g(x) \bigr)\, h(x)\, dx = c \int_a^b f(x)\, h(x)\, dx + d \int_a^b g(x)\, h(x)\, dx, $$
valid for arbitrary continuous functions f, g, h and scalars (constants) c, d. The second bilinearity axiom is proved similarly; alternatively, one can use symmetry to deduce it from the first. Finally, positivity requires that
$$ \| f \|^2 = \langle f \,;\, f \rangle = \int_a^b f(x)^2\, dx \;\geq\; 0. $$
This is clear because f(x)² ≥ 0, and the integral of a nonnegative function is nonnegative. Moreover, since the function f(x)² is continuous and nonnegative, its integral will vanish, $\int_a^b f(x)^2\, dx = 0$, if and only if f(x) ≡ 0 is the zero function; see Exercise for the proof. This completes the demonstration.

Remark: The preceding construction holds for more general functions, but we have restricted our attention to continuous functions to avoid certain technical complications. The most general function space admitting this important inner product is known as Hilbert space, which forms the foundation for the rigorous theory of Fourier series, and also lies at the heart of modern quantum mechanics. We will discuss some of the details of the Hilbert space construction later on. One does need to be extremely careful when trying to extend the inner product to more general functions. Indeed, there are discontinuous functions with zero L² norm. For instance, the function
$$ f(x) = \begin{cases} 1, & x = 0, \\ 0, & \text{otherwise}, \end{cases} \qquad \text{satisfies} \qquad \| f \|^2 = \int_{-1}^{1} f(x)^2\, dx = 0, \tag{3.10} $$
because any function which is zero except at finitely many (or even countably many) points has zero integral.

One can also define weighted inner products on the function space C⁰[ a, b ]. The role of the weight is played by a (continuous) positive scalar function w(x) > 0. The corresponding weighted inner product and norm are
$$ \langle f \,;\, g \rangle = \int_a^b f(x)\, g(x)\, w(x)\, dx, \qquad \| f \| = \sqrt{ \int_a^b f(x)^2\, w(x)\, dx }. \tag{3.11} $$
Figure 3.2. Angle Between Two Vectors.

The verification of the inner product axioms in this case is left as an exercise for the reader.

3.2. Inequalities.
Returning to the general framework of inner products on vector spaces, we now prove
the most important inequality in applied mathematics. Its origins can be found in the
geometric interpretation of the dot product on Euclidean space in terms of the angle
between vectors.
The Cauchy-Schwarz Inequality
In two and three-dimensional Euclidean geometry, the dot product between two vectors can be geometrically characterized by the equation
$$ v \cdot w = \| v \|\, \| w \| \cos\theta, \tag{3.12} $$
where θ measures the angle between the vectors v and w, as depicted in Figure 3.2. Since | cos θ | ≤ 1, the absolute value of the dot product is bounded by the product of the lengths of the vectors:
$$ | v \cdot w | \leq \| v \|\, \| w \|. $$
This basic inequality, named after two of the founders of modern analysis, Augustin Cauchy and Hermann Schwarz, holds, in fact, for any inner product.
Theorem 3.6. Every inner product satisfies the Cauchy-Schwarz inequality
$$ | \langle v \,;\, w \rangle | \;\leq\; \| v \|\, \| w \|, \qquad v, w \in V. \tag{3.13} $$

Russians also give priority for its discovery to their compatriot Viktor Bunyakovskii, and,
indeed, many authors append his name to the inequality as well.

Here ‖ v ‖ is the associated norm, while | · | denotes absolute value. Equality holds if and only if v and w are parallel vectors.
Proof: The case when v = 0 or w = 0 is trivial, since both sides of (3.13) are equal to 0. Thus, we may suppose w ≠ 0. Let t ∈ R be an arbitrary scalar. Using the three basic inner product axioms, we have
$$ 0 \;\leq\; \| v + t\,w \|^2 = \langle v + t\,w \,;\, v + t\,w \rangle = \| v \|^2 + 2\,t\,\langle v \,;\, w \rangle + t^2\,\| w \|^2, \tag{3.14} $$
with equality holding if and only if v = − t w, which requires v and w to be parallel vectors. We fix v and w, and consider the right hand side of (3.14) as a quadratic function,
$$ p(t) = \| w \|^2\, t^2 + 2\,\langle v \,;\, w \rangle\, t + \| v \|^2, $$
of the scalar variable t. Since the coefficient of t² is positive, p(t) has a minimum where its derivative vanishes:
$$ p'(t) = 2\,\| w \|^2\, t + 2\,\langle v \,;\, w \rangle = 0, \qquad \text{which occurs when} \qquad t = -\,\frac{ \langle v \,;\, w \rangle }{ \| w \|^2 }. $$
Substituting this particular minimizing value into (3.14), we find
$$ 0 \;\leq\; \| v \|^2 - 2\,\frac{ \langle v \,;\, w \rangle^2 }{ \| w \|^2 } + \frac{ \langle v \,;\, w \rangle^2 }{ \| w \|^2 } = \| v \|^2 - \frac{ \langle v \,;\, w \rangle^2 }{ \| w \|^2 }. $$
Rearranging this last inequality, we conclude that
$$ \frac{ \langle v \,;\, w \rangle^2 }{ \| w \|^2 } \;\leq\; \| v \|^2, \qquad \text{or} \qquad \langle v \,;\, w \rangle^2 \;\leq\; \| v \|^2\, \| w \|^2. $$
Taking the (positive) square root of both sides of the final inequality completes the theorem's proof. Q.E.D.
Therefore, given any inner product on a vector space, we can use the quotient
$$ \cos\theta = \frac{ \langle v \,;\, w \rangle }{ \| v \|\, \| w \| } \tag{3.15} $$
to define the angle θ between the elements v, w ∈ V. The Cauchy-Schwarz inequality tells us that the right hand ratio lies between −1 and +1, and hence the angle is well-defined, and, in fact, unique if we restrict it to lie in the range 0 ≤ θ ≤ π.
For example, using the standard dot product, the angle between the vectors v = ( 1, 0, 1 )^T and w = ( 0, 1, 1 )^T in R^3 is given by
$$ \cos\theta = \frac{1}{ \sqrt{2}\,\sqrt{2} } = \frac12, $$

Two vectors are parallel if and only if one is a scalar multiple of the other. The zero vector
is parallel to every other vector, by convention.

and so θ = ⅓π (i.e., 60°) or θ = 5π/3 (i.e., 300°), depending on which direction one measures the angle. Similarly, the angle θ between the polynomials p(x) = x and q(x) = x² defined on the interval I = [ 0, 1 ] is given by
$$ \cos\theta = \frac{ \langle x \,;\, x^2 \rangle }{ \| x \|\, \| x^2 \| } = \frac{ \displaystyle\int_0^1 x^3\, dx }{ \sqrt{ \displaystyle\int_0^1 x^2\, dx }\; \sqrt{ \displaystyle\int_0^1 x^4\, dx } } = \frac{ \tfrac14 }{ \sqrt{ \tfrac13 }\,\sqrt{ \tfrac15 } } = \sqrt{ \frac{15}{16} }, $$
so that θ = 0.25268 . . . radians.
Warning: One should not try to give this notion of angle between functions more significance than the formal definition warrants; it does not correspond to any angular properties of their graph. Also, the value depends on the choice of inner product and the interval upon which it is being computed. For example, if we change to the inner product on the interval [ −1, 1 ], then ⟨ x ; x² ⟩ = ∫_{-1}^{1} x³ dx = 0, and hence (3.15) becomes cos θ = 0, so the angle between x and x² is now θ = ½π.
Even in Euclidean space R^n, the measurement of angle (and length) depends upon the choice of an underlying inner product. Different inner products lead to different angle measurements; only for the standard Euclidean dot product does angle correspond to our everyday experience.
Orthogonal Vectors
A particularly important geometrical configuration occurs when two vectors are perpendicular, which means that they meet at a right angle: θ = ½π or θ = 3π/2. This occurs if and only if cos θ = 0. Equation (3.15) implies that the vectors v, w are perpendicular if and only if their dot product vanishes: v · w = 0.
This particular configuration also plays a key role in general inner product spaces. For historical reasons, the proper technical term is orthogonal instead of perpendicular.
Definition 3.7. Two elements v, w ∈ V of an inner product space V are called orthogonal if ⟨ v ; w ⟩ = 0.
Orthogonality is a powerful and important tool in all applications of linear algebra, and we devote Chapter 5 to its detailed development.
Example 3.8. The vectors v = ( 1, 2 )^T and w = ( 6, -3 )^T are orthogonal with respect to the Euclidean dot product in R^2, since
$$ v \cdot w = v^T w = ( 1 \ \ 2 ) \begin{pmatrix} 6 \\ -3 \end{pmatrix} = 1 \cdot 6 + 2 \cdot (-3) = 0. $$
Therefore, they meet at a 90° angle. Note that these vectors are not orthogonal with respect to the modified inner product (3.6):
$$ \langle v \,;\, w \rangle = \Bigl\langle \begin{pmatrix} 1 \\ 2 \end{pmatrix} \,;\, \begin{pmatrix} 6 \\ -3 \end{pmatrix} \Bigr\rangle = 1 \cdot 6 - 1 \cdot (-3) - 2 \cdot 6 + 4 \cdot 2 \cdot (-3) = -27 \neq 0. $$
Figure 3.3. Triangle Inequality.

Thus, orthogonality, like angles in general, depends upon which inner product is being
used.
Example 3.9. The polynomials
$$ p(x) = x \qquad \text{and} \qquad q(x) = x^2 - \tfrac12 $$
are orthogonal with respect to the inner product ⟨ p ; q ⟩ = ∫₀¹ p(x) q(x) dx on the interval [ 0, 1 ], since
$$ \bigl\langle x \,;\, x^2 - \tfrac12 \bigr\rangle = \int_0^1 x \left( x^2 - \tfrac12 \right) dx = \int_0^1 \left( x^3 - \tfrac12\,x \right) dx = 0. $$
They fail to be orthogonal on most other intervals. For example, on the interval [ 0, 2 ],
$$ \bigl\langle x \,;\, x^2 - \tfrac12 \bigr\rangle = \int_0^2 x \left( x^2 - \tfrac12 \right) dx = \int_0^2 \left( x^3 - \tfrac12\,x \right) dx = 3. $$

The Triangle Inequality


The more familiar triangle inequality is an easy consequence of the Cauchy-Schwarz inequality. Referring to Figure 3.3, it states that the length of one side of a triangle is at most equal to the sum of the lengths of the other two sides. In vector language, if the two sides are represented by vectors v and w, then the third corresponds to their sum v + w. The same property applies to general norms.
Theorem 3.10. The norm associated with an inner product satisfies the triangle inequality
$$ \| v + w \| \;\leq\; \| v \| + \| w \| \tag{3.16} $$
for every v, w ∈ V. Equality holds if and only if v and w are parallel vectors.
Proof: We compute
$$ \| v + w \|^2 = \| v \|^2 + 2\,\langle v \,;\, w \rangle + \| w \|^2 \;\leq\; \| v \|^2 + 2\,\| v \|\,\| w \| + \| w \|^2 = \bigl( \| v \| + \| w \| \bigr)^2. $$
The middle inequality is a consequence of the Cauchy-Schwarz inequality. Taking square roots of both sides and using positivity completes the proof. Q.E.D.
Example 3.11. Consider the vectors v = ( 1, 2, -1 )^T and w = ( 2, 0, 3 )^T, with sum v + w = ( 3, 2, 2 )^T. Their Euclidean norms are ‖ v ‖ = √6 and ‖ w ‖ = √13, while ‖ v + w ‖ = √17. The triangle inequality (3.16) in this case says √17 ≤ √6 + √13, which is a valid inequality.
Example 3.12. Consider the functions f(x) = x − 1 and g(x) = x² + 1. Using the L² norm on the interval [ 0, 1 ], we find
$$ \| f \| = \sqrt{ \int_0^1 (x-1)^2\, dx } = \sqrt{ \tfrac13 }, \qquad \| g \| = \sqrt{ \int_0^1 (x^2+1)^2\, dx } = \sqrt{ \tfrac{28}{15} }, \qquad \| f + g \| = \sqrt{ \int_0^1 (x^2 + x)^2\, dx } = \sqrt{ \tfrac{31}{30} }. $$
The triangle inequality requires $\sqrt{\tfrac{31}{30}} \leq \sqrt{\tfrac13} + \sqrt{\tfrac{28}{15}}$, which is true.

The Cauchy-Schwarz and triangle inequalities look much more impressive when written out in full detail. For the Euclidean inner product (3.2), they are
$$ \Bigl| \sum_{i=1}^n v_i\,w_i \Bigr| \;\leq\; \sqrt{ \sum_{i=1}^n v_i^2 }\; \sqrt{ \sum_{i=1}^n w_i^2 }, \qquad \sqrt{ \sum_{i=1}^n (v_i + w_i)^2 } \;\leq\; \sqrt{ \sum_{i=1}^n v_i^2 } + \sqrt{ \sum_{i=1}^n w_i^2 }. \tag{3.17} $$
Theorems 3.6 and 3.10 imply that these inequalities are valid for arbitrary real numbers v_1, . . . , v_n, w_1, . . . , w_n. For the L² inner product (3.9) on function space, they produce the following very impressive integral inequalities:
$$ \Bigl| \int_a^b f(x)\,g(x)\, dx \Bigr| \;\leq\; \sqrt{ \int_a^b f(x)^2\, dx }\; \sqrt{ \int_a^b g(x)^2\, dx }, \qquad \sqrt{ \int_a^b \bigl( f(x) + g(x) \bigr)^2 dx } \;\leq\; \sqrt{ \int_a^b f(x)^2\, dx } + \sqrt{ \int_a^b g(x)^2\, dx }, \tag{3.18} $$
which are also valid for arbitrary continuous (and even more general) functions. The first of these is the original Cauchy-Schwarz inequality, whose proof appeared to be quite
deep when it first appeared in the nineteenth century. Only after the abstract notion of
an inner product space was properly formulated did its innate simplicity and generality
become evident. One can also generalize either of these sets of inequalities to weighted
inner products. For example, one can replace the integration element dx by a weighted
element w(x) dx, provided w(x) > 0, in both inequalities (3.18).

3.3. Norms.
Every inner product gives rise to an associated norm that measures the magnitude
or size of the elements of the underlying vector space. In applications, other notions of
magnitude often play a useful role, but do not arise from an inner product. To handle such
quantities, we shall extract the properties that do not rely on the inner product structure,
and thereby formulate the general definition of a norm on a vector space.
Definition 3.13. A norm on the vector space V assigns a non-negative real number ‖ v ‖ ≥ 0 to each vector v ∈ V, subject to the following axioms for all v, w ∈ V, and c ∈ R:
(i) Positivity: ‖ v ‖ ≥ 0, with ‖ v ‖ = 0 if and only if v = 0.
(ii) Homogeneity: ‖ c v ‖ = | c | ‖ v ‖.
(iii) Triangle inequality: ‖ v + w ‖ ≤ ‖ v ‖ + ‖ w ‖.
As we learned, every inner product gives rise to a norm. Indeed, positivity of the norm is one of the inner product axioms. The homogeneity property follows since
$$ \| c\,v \| = \sqrt{ \langle c\,v \,;\, c\,v \rangle } = \sqrt{ c^2\,\langle v \,;\, v \rangle } = | c |\, \| v \|. $$
Finally, the triangle inequality for an inner product norm was established in Theorem 3.10.
Not every norm arises from an inner product. Here are some important examples.
Example 3.14. Let V = R^n. The 1-norm of a vector v = ( v_1, v_2, . . . , v_n )^T is defined as the sum of the absolute values of its entries:
$$ \| v \|_1 = | v_1 | + \cdots + | v_n |. \tag{3.19} $$
The max or ∞-norm is equal to the maximal entry (in absolute value):
$$ \| v \|_\infty = \sup\,\{ | v_1 |, \ldots, | v_n | \}. \tag{3.20} $$
Verification of the positivity and homogeneity properties for these two norms is straightforward; the triangle inequality is an elementary consequence of the basic inequality
$$ | a + b | \;\leq\; | a | + | b | $$
for absolute values.
The Euclidean norm, 1-norm, and ∞-norm on R^n are just three representatives of the general p-norm
$$ \| v \|_p = \sqrt[p]{ \sum_{i=1}^n | v_i |^p }. \tag{3.21} $$
This quantity defines a norm for any 1 ≤ p < ∞. The ∞-norm is a limiting case of the p-norm as p → ∞. Note that the Euclidean norm (3.3) is the 2-norm, and is often designated as such. It is the only p-norm which comes from an inner product. The positivity and homogeneity properties of the p-norm are straightforward. The triangle inequality, however, is not trivial; in detail, it reads
$$ \sqrt[p]{ \sum_{i=1}^n | v_i + w_i |^p } \;\leq\; \sqrt[p]{ \sum_{i=1}^n | v_i |^p } + \sqrt[p]{ \sum_{i=1}^n | w_i |^p }, \tag{3.22} $$
and is known as the Hölder inequality. A proof is outlined in the exercises.
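In practice such norms are computed with library routines; the following NumPy sketch evaluates the 1, 2, ∞, and a general p norm of an illustrative vector:

```python
import numpy as np

v = np.array([1., -2., 3.])
print(np.linalg.norm(v, 1))        # 1-norm: 6.0
print(np.linalg.norm(v, 2))        # Euclidean (2-) norm: sqrt(14)
print(np.linalg.norm(v, np.inf))   # infinity (max) norm: 3.0
print(np.linalg.norm(v, 4))        # a general p-norm, here p = 4
```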

Example 3.15. There are analogous norms on the space C⁰[ a, b ] of continuous functions on an interval [ a, b ]. Basically, one replaces the previous sums by integrals. Thus, the Lᵖ norm is defined as
$$ \| f \|_p = \sqrt[p]{ \int_a^b | f(x) |^p\, dx }. \tag{3.23} $$
In particular, the L¹ norm is given by integrating the absolute value of the function:
$$ \| f \|_1 = \int_a^b | f(x) |\, dx. \tag{3.24} $$
The L² norm (3.9) appears as a special case, p = 2, and, again, is the only one arising from an inner product. The proof of the general triangle or Hölder inequality for p ≠ 1, 2 is again not trivial. The limiting L^∞ norm is defined by the maximum
$$ \| f \|_\infty = \max\,\{ | f(x) | \,:\, a \leq x \leq b \}. \tag{3.25} $$
The triangle inequality in the latter case is an easy exercise.


Every norm defines a distance between vectors, namely
d(v, w) = k v w k.

(3.26)

For the standard dot product norm, we recover the usual notion of distance in Euclidean
space. Other types of norms produce alternative (and sometimes useful) notions of distance that, nevertheless, satisfy all the familiar distance axioms. Notice that distance is
symmetric, d(v, w) = d(w, v). Moreover, d(v, w) = 0 if and only if v = w. The triangle
inequality implies that
d(v, w) d(v, z) + d(z, w)
for any triple of vectors v, w, z.

for discontinuous functions, one replaces the maximum by the essential supremum, cf. [ 105 ].

Example 3.16. Consider the polynomial p(x) = 3x² − 2 on the interval −1 ≤ x ≤ 1. Its L² norm is
$$ \| p \|_2 = \sqrt{ \int_{-1}^1 (3x^2 - 2)^2\, dx } = \sqrt{ \frac{18}{5} } = 1.8974\ldots. $$
Its L^∞ norm is
$$ \| p \|_\infty = \max\,\bigl\{ | 3x^2 - 2 | \,:\, -1 \leq x \leq 1 \bigr\} = 2, $$
with the maximum occurring at x = 0. Finally, its L¹ norm is
$$ \begin{aligned} \| p \|_1 &= \int_{-1}^1 | 3x^2 - 2 |\, dx \\ &= \int_{-1}^{-\sqrt{2/3}} (3x^2 - 2)\, dx + \int_{-\sqrt{2/3}}^{\sqrt{2/3}} (2 - 3x^2)\, dx + \int_{\sqrt{2/3}}^{1} (3x^2 - 2)\, dx \\ &= \Bigl( \tfrac43 \sqrt{\tfrac23} - 1 \Bigr) + \tfrac83 \sqrt{\tfrac23} + \Bigl( \tfrac43 \sqrt{\tfrac23} - 1 \Bigr) = \tfrac{16}{3} \sqrt{\tfrac23} - 2 = 2.3546\ldots. \end{aligned} $$
Unit Vectors
Let V be a fixed normed vector space. The elements of norm or length equal to 1 play a special role. In general, a unit vector (or function) is an element u ∈ V that has unit norm ‖ u ‖ = 1. The following easy lemma shows how to construct a unit vector pointing in the same direction as any given nonzero vector.
Lemma 3.17. If v ≠ 0 is any nonzero vector, then the vector u = v / ‖ v ‖ obtained by dividing v by its norm is a unit vector parallel to v.
Proof: We compute, making use of the homogeneity property of the norm:
$$ \| u \| = \Bigl\| \frac{v}{\| v \|} \Bigr\| = \frac{ \| v \| }{ \| v \| } = 1. $$
This completes the proof. Q.E.D.

Example 3.18. The vector v = ( 1, -2 )^T has length ‖ v ‖₂ = √5 with respect to the standard Euclidean norm. Therefore, the unit vector pointing in the same direction as v is
$$ u = \frac{v}{\| v \|_2} = \frac{1}{\sqrt 5} \begin{pmatrix} 1 \\ -2 \end{pmatrix} = \begin{pmatrix} \tfrac{1}{\sqrt 5} \\[2pt] -\tfrac{2}{\sqrt 5} \end{pmatrix}. $$
On the other hand, for the 1-norm, ‖ v ‖₁ = 3, and so
$$ u = \frac{v}{\| v \|_1} = \frac13 \begin{pmatrix} 1 \\ -2 \end{pmatrix} = \begin{pmatrix} \tfrac13 \\[2pt] -\tfrac23 \end{pmatrix} $$
is the unit vector parallel to v in the 1-norm. Finally, ‖ v ‖_∞ = 2, and the corresponding unit vector in the ∞-norm is u = v / ‖ v ‖_∞ = ( ½, -1 )^T.


Figure 3.4. Unit Spheres for the 1, 2 and ∞ Norms in R^2.

Similarly, on the interval [ 0, 1 ], the quadratic polynomial p(x) = x² − ½ has L² norm
$$ \| p \| = \sqrt{ \int_0^1 \left( x^2 - \tfrac12 \right)^2 dx } = \sqrt{ \int_0^1 \left( x^4 - x^2 + \tfrac14 \right) dx } = \sqrt{ \frac{7}{60} }. $$
Therefore,
$$ u(x) = \frac{p(x)}{\| p \|} = \sqrt{ \frac{60}{7} }\, x^2 - \sqrt{ \frac{15}{7} } $$
is a unit polynomial, meaning that ‖ u ‖ = \sqrt{ \int_0^1 u(x)^2\, dx } = 1, which is parallel to (or, more correctly, a scalar multiple of) the polynomial p.


Remark : The notion of unit vector or function always depends upon which inner
product, and hence which norm, is being used.
The unit sphere for the given norm is defined as the set of all unit vectors

S1 = k u k = 1 V.

(3.27)

Thus, the unit sphere for the Euclidean norm on R n is the usual round sphere

S1 = k x k2 = x21 + x22 + + x2n = 1 .

For the norm, it is the unit cube

S1 = { x R n | x1 = 1 or x2 = 1 or . . . or xn = 1 } .
For the 1 norm, it is the unit diamond or octahedron
S1 = { x R n | | x 1 | + | x 2 | + + | x n | = 1 } .
See Figure 3.4 for the two-dimensional pictures.
If V is a finite-dimensional normed vector space, then the unit sphere S1 forms a
compact subset, meaning that it is closed and bounded. This topological fact, which is
not true in infinite-dimensional spaces, underscores the fundamental distinction between
finite-dimensional vector space theory and the vastly more complicated infinite-dimensional
case.
Equivalence of Norms
While there are many different types of norms, in a finite-dimensional vector space they
are all more or less equivalent. This turns out to be a consequence of the aforementioned
compactness property of the unit sphere.
Theorem 3.19. Let ‖ · ‖₁ and ‖ · ‖₂ be any two norms on R^n. Then there exist positive constants c⋆, C⋆ > 0 such that
$$ c^\star\, \| v \|_1 \;\leq\; \| v \|_2 \;\leq\; C^\star\, \| v \|_1 \qquad \text{for every } v \in R^n. \tag{3.28} $$
Proof: We just sketch the basic idea, leaving the details to a more rigorous real analysis course, cf. [106, 105]. We begin by noting that a norm defines a continuous function f(v) = ‖ v ‖ on R^n. (Continuity is, in fact, a consequence of the triangle inequality.) Let S₁ = { ‖ u ‖₁ = 1 } denote the unit sphere of the first norm. Any continuous function on a compact set achieves both a maximum and a minimum value. Applying this result to the second norm function restricted to the unit sphere S₁ of the first norm, we can set
$$ 0 < c^\star = \min\,\{ \| u \|_2 \mid u \in S_1 \} \;\leq\; C^\star = \max\,\{ \| u \|_2 \mid u \in S_1 \} < \infty. \tag{3.29} $$
These will serve as the constants in the desired inequalities (3.28). Indeed, by definition,
$$ c^\star \;\leq\; \| u \|_2 \;\leq\; C^\star \qquad \text{when} \qquad \| u \|_1 = 1, \tag{3.30} $$
and so (3.28) holds for all u ∈ S₁. To prove the two inequalities in general, assume v ≠ 0. (The case v = 0 is trivial.) Lemma 3.17 says that u = v / ‖ v ‖₁ ∈ S₁ is a unit vector in the first norm: ‖ u ‖₁ = 1. Moreover, by the homogeneity property of the norm, ‖ u ‖₂ = ‖ v ‖₂ / ‖ v ‖₁. Substituting into (3.30) and clearing denominators completes the proof of (3.28). Q.E.D.
Example 3.20. For example, consider the Euclidean norm ‖ · ‖₂ and the max norm ‖ · ‖_∞ on R^n. According to (3.29), the bounding constants are found by minimizing and maximizing ‖ u ‖_∞ over all unit vectors ‖ u ‖₂ = 1 on the (round) unit sphere. The maximal value is obtained at the poles, when u = ± e_k, with ‖ e_k ‖_∞ = 1, and so C⋆ = 1. The minimal value is obtained when u has all equal components, which means u = ( ±1/√n, . . . , ±1/√n )^T, with c⋆ = ‖ u ‖_∞ = 1/√n. Therefore, we find
$$ \frac{1}{\sqrt n}\, \| v \|_2 \;\leq\; \| v \|_\infty \;\leq\; \| v \|_2. \tag{3.31} $$
One can interpret these inequalities as follows. Suppose v is a vector lying on the unit sphere in the Euclidean norm, so ‖ v ‖₂ = 1. Then (3.31) tells us that its ∞ norm is bounded from above and below by 1/√n ≤ ‖ v ‖_∞ ≤ 1. Therefore, the unit Euclidean sphere sits inside the unit sphere in the ∞ norm, and outside the sphere of radius 1/√n. Figure 3.5 illustrates the two-dimensional situation.
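The bounds in (3.31) can also be tested empirically; the following NumPy sketch samples random vectors and checks that the ratio of the ∞ norm to the Euclidean norm always lies between 1/√n and 1:

```python
import numpy as np

rng = np.random.default_rng(0)
n = 5
for _ in range(1000):
    v = rng.standard_normal(n)
    ratio = np.linalg.norm(v, np.inf) / np.linalg.norm(v, 2)
    assert 1/np.sqrt(n) - 1e-12 <= ratio <= 1 + 1e-12   # the bounds in (3.31)
print("bounds (3.31) hold for all samples")
```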
Figure 3.5. Equivalence of Norms: the ∞ norm and 2 norm; the 1 norm and 2 norm.

Remark : One important consequence of the equivalence of norms is that, in R n , convergence is independent of the norm. The following are all equivalent to the standard
convergence of a sequence u(1) , u(2) , u(3) , . . . of vectors in R n :
(a) the vectors converge: u(k) u? :
(k)

(b) the individual components all converge: ui u?i for i = 1, . . . , n.


(c) the difference in norms goes to zero: k u(k) u? k 0.
The last case, called convergence in norm, does not depend on which norm is chosen.
Indeed, the basic inequality (3.28) implies that if one norm goes to zero, so does any
other norm. The convergence of sequences underlies the topology of subsets of the space,
and so an important consequence is that all norms on R n induce the same topology
notions of open and closed sets, and convergence of sequences. None of this is true in
infinite-dimensional function space! A rigorous development of the underlying topological
and analysis notions of compactness, continuity, and convergence is beyond the scope of
this course. The student is encouraged to consult a text in real analysis, e.g., [105, 106],
to find the relevant definitions, theorems and proofs.

Example 3.21. Consider the infinite-dimensional vector space C⁰[ 0, 1 ] consisting of all continuous functions on the interval [ 0, 1 ]. The functions
$$ f_n(x) = \begin{cases} 1 - n\,x, & 0 \leq x \leq \tfrac1n, \\[2pt] 0, & \tfrac1n \leq x \leq 1, \end{cases} $$
have identical L^∞ norms
$$ \| f_n \|_\infty = \sup\,\{ | f_n(x) | \,:\, 0 \leq x \leq 1 \} = 1. $$
On the other hand, their L² norm
$$ \| f_n \|_2 = \sqrt{ \int_0^1 [\,f_n(x)\,]^2\, dx } = \sqrt{ \int_0^{1/n} (1 - n\,x)^2\, dx } = \frac{1}{\sqrt{3\,n}} $$
goes to zero as n → ∞. This shows that there is no constant C⋆ such that
$$ \| f \|_\infty \;\leq\; C^\star\, \| f \|_2 $$
for all f ∈ C⁰[ 0, 1 ]. The L^∞ and L² norms on C⁰[ 0, 1 ] are not equivalent.
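A small numerical experiment illustrates this failure of equivalence; the sketch below (assuming NumPy, and approximating the integrals on a fine grid) shows that the sup norm of f_n stays at 1 while its L² norm decays like 1/√(3n):

```python
import numpy as np

# f_n(x) = 1 - n x on [0, 1/n], 0 elsewhere; sample on a fine grid of [0, 1].
x = np.linspace(0., 1., 200001)
for n in (1, 10, 100, 1000):
    fn = np.clip(1. - n * x, 0., None)
    sup_norm = fn.max()                        # always 1
    L2_norm = np.sqrt(np.trapz(fn**2, x))      # approximately 1/sqrt(3 n), tending to 0
    print(n, sup_norm, L2_norm)
```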

3.4. Positive Definite Matrices.


Let us now return to the study of inner products, and concentrate our attention on the
finite-dimensional situation. The immediate goal of this section is to determine the most
general inner product which can be placed on the finite-dimensional vector space R n . The
analysis will lead us to the extremely important class of positive definite matrices, that
will reappear in a variety of applications, including minimization problems, mechanics,
electrical circuits, and differential equations.
T
Let h ; i denote an inner product between vectors in R n . Let x = ( x1 x2 . . . xn ) ,
T
y = ( y1 y2 . . . yn ) , be any two vectors in R n . We write the vectors in terms of the
standard basis vectors e1 , . . . , en :
x = x 1 e1 + + x n en ,

y = y 1 e1 + + y n en .

(3.32)

Let us carefully analyze the three basic inner product axioms, in order. We use the bilinearity of the inner product to compute
$$ \langle x \,;\, y \rangle = \Bigl\langle \sum_{i=1}^n x_i\,e_i \,;\, \sum_{j=1}^n y_j\,e_j \Bigr\rangle = \sum_{i,j=1}^n x_i\,y_j\, \langle e_i \,;\, e_j \rangle. $$
Therefore we can write
$$ \langle x \,;\, y \rangle = \sum_{i,j=1}^n k_{ij}\, x_i\, y_j = x^T K\, y, \tag{3.33} $$
where K denotes the n × n matrix of inner products of the basis vectors,
$$ k_{ij} = \langle e_i \,;\, e_j \rangle, \qquad i, j = 1, \ldots, n. \tag{3.34} $$
We conclude that any inner product must be expressed in the bilinear form given by matrix multiplication (3.33). The two remaining inner product axioms will impose certain conditions on the inner product matrix K.
The symmetry of the inner product implies that
$$ k_{ij} = \langle e_i \,;\, e_j \rangle = \langle e_j \,;\, e_i \rangle = k_{ji}, \qquad i, j = 1, \ldots, n, $$
which means that the inner product matrix K is symmetric:
$$ K = K^T. $$
Conversely, symmetry of K ensures symmetry of the bilinear form:
$$ \langle x \,;\, y \rangle = x^T K\, y = ( x^T K\, y )^T = y^T K^T x = y^T K\, x = \langle y \,;\, x \rangle, $$
where the second equality follows from the fact that the quantity is a scalar.
The final condition for an inner product is positivity. This requires that

    ‖x‖² = ⟨x ; x⟩ = xᵀ K x = Σ_{i,j=1}^n kij xi xj ≥ 0        (3.35)

for all x ∈ Rⁿ, with equality if and only if x = 0. The precise meaning of this positivity
condition on the matrix K is not immediately evident, and so will be encapsulated in the
following very important definition.
Definition 3.22. An n × n matrix K is called positive definite if it is symmetric, K = Kᵀ, and satisfies the positivity condition

    xᵀ K x > 0    for all    0 ≠ x ∈ Rⁿ.        (3.36)

We will sometimes write K > 0 to mean that K is a symmetric, positive definite matrix.
Warning: We will only say that a matrix is positive definite when it is symmetric. The condition K > 0 does not mean that all the entries of K are positive. There are many positive definite matrices which have some negative entries; see Example 3.24 below. Conversely, many symmetric matrices with all positive entries are not positive definite!
Our preliminary analysis has resulted in the following characterization of inner products on a finite-dimensional vector space.
Theorem 3.23. Every inner product on Rⁿ is given by

    ⟨x ; y⟩ = xᵀ K y    for    x, y ∈ Rⁿ,        (3.37)

where K is a symmetric, positive definite matrix.


Example
3.24. Even though some of the entries of the symmetric matrix K =
4 2
are negative, it is, nevertheless, a positive definite matrix. Indeed, we note
2 3
that
2

q(x) = xT K x = 4 x21 4 x1 x2 + 3 x22 = 2 x1 x2 + 2 x22 0

is a sum of two non-negtive quantities. Moreover, q(x) = 0 if and only if both terms are
zero, which requires that 2 x1 x2 = 0 and x2 = 0, whereby x1 = 0 also. This proves
positivity for all nonzero x, and hence K > 0 is indeed a positive definite matrix. The
corresponding inner product on R 2 is


4 2
y1
T
h x ; y i = ( x 1 x2 )
= 4 x 1 y1 2 x 1 y2 2 x 2 y1 + 3 x 2 y2 .
2 3
y2

1 2
On the other hand, despite the fact that the matrix K =
has all positive
2 1
entries, it is not a positive definite matrix. Indeed, writing out
q(x) = xT K x = x21 + 4 x1 x2 + x22 ,
3/7/03

106

c 2003

Peter J. Olver

we find, for instance, that when x = ( 1 1 ) , the function value q(x) = 2 < 0 is not
positive. These two simple examples should be enough convince the reader that determining whether a given symmetric matrix is or is not positive definite is not completely
elementary.
The expression

    q(x) = xᵀ K x = Σ_{i,j=1}^n kij xi xj        (3.38)

is known as a quadratic form on Rⁿ. Exercise  shows that the coefficient matrix K in any quadratic form can be taken to be symmetric without any loss of generality. The quadratic form is called positive definite if

    q(x) > 0    for all    0 ≠ x ∈ Rⁿ.        (3.39)

Thus, a quadratic form is positive definite if and only if its coefficient matrix is.
With a little practice, it is not difficult to read off the coefficient matrix from the explicit formula for the quadratic form.
Example 3.25. Consider the quadratic form

    q(x, y, z) = x² + 4 x y + 6 y² - 2 x z + 9 z²

depending upon three variables. The corresponding coefficient matrix is

    K = [  1  2  -1 ]
        [  2  6   0 ],        whereby        q(x, y, z) = ( x  y  z ) K ( x, y, z )ᵀ.
        [ -1  0   9 ]

Note that the squared terms in q contribute directly to the diagonal entries of K, while the mixed terms are split in half to give the symmetric off-diagonal entries. The reader might wish to try proving that this particular matrix is positive definite by proving positivity of the quadratic form: q(x, y, z) > 0 for all nonzero ( x, y, z )ᵀ ∈ R³.
Slightly more generally, the quadratic form and its associated symmetric matrix are called positive semi-definite if

    q(x) = xᵀ K x ≥ 0    for all    x ∈ Rⁿ.        (3.40)

A positive semi-definite matrix may have null directions, meaning non-zero vectors z such that q(z) = zᵀ K z = 0. Clearly any vector z ∈ ker K that lies in the matrix's kernel defines a null direction, but there may be others. In particular, a positive definite matrix is not allowed to have null directions, so ker K = {0}. Proposition 2.40 implies that all positive definite matrices are invertible.
Theorem 3.26. All positive definite matrices K are non-singular: ker K = {0}.
Example 3.27. The matrix

    K = [  1  -1 ]
        [ -1   1 ]

is positive semi-definite, but not positive definite. Indeed, the associated quadratic form

    q(x) = xᵀ K x = x1² - 2 x1 x2 + x2² = (x1 - x2)² ≥ 0

is a perfect square, and so clearly non-negative. However, the elements of ker K, namely the scalar multiples of the vector ( 1, 1 )ᵀ, define null directions, since q(1, 1) = 0.

a b
is positive definite
Example 3.28. A general symmetric 2 2 matrix K =
b c
if and only if the associated quadratic form satisfies
q(x) = a x21 + 2 b x1 x2 + c x22 > 0

(3.41)

for all x 6= 0. Analytic geometry tells us that this is the case if and only if
a c b2 > 0,

a > 0,

(3.42)

so that the quadratic form has positive leading coefficient and positive determinant (or
negative discriminant). A direct proof of this elementary fact will appear shortly.
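The 2 × 2 criterion (3.42) is easy to automate. The following Python sketch (our addition, not part of the text) applies it to the two matrices of Example 3.24:

    def is_positive_definite_2x2(a, b, c):
        """Return True when [[a, b], [b, c]] is positive definite, per (3.42)."""
        return a > 0 and a * c - b ** 2 > 0

    print(is_positive_definite_2x2(4, -2, 3))   # True:  [[ 4, -2], [-2, 3]]
    print(is_positive_definite_2x2(1, 2, 1))    # False: [[ 1,  2], [ 2, 1]]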
Gram Matrices
Symmetric matrices whose entries are inner products of elements of an inner product space play an important role. They are named after the nineteenth century Danish
mathematician Jorgen Gram, not the metric mass unit!
Definition 3.29. Let V be an inner product space, and let v1, . . . , vn ∈ V be elements thereof. The associated Gram matrix

    K = [ ⟨v1 ; v1⟩  ⟨v1 ; v2⟩  ...  ⟨v1 ; vn⟩ ]
        [ ⟨v2 ; v1⟩  ⟨v2 ; v2⟩  ...  ⟨v2 ; vn⟩ ]
        [    ...        ...     ...     ...   ]        (3.43)
        [ ⟨vn ; v1⟩  ⟨vn ; v2⟩  ...  ⟨vn ; vn⟩ ]

is the n × n matrix whose entries are the inner products between the vector space elements.
Symmetry of the inner product implies symmetry of the Gram matrix:

    kij = ⟨vi ; vj⟩ = ⟨vj ; vi⟩ = kji,    and hence    Kᵀ = K.        (3.44)

The most direct method for producing positive definite and semi-definite matrices is
through the Gram matrix construction.
Theorem 3.30. All Gram matrices are positive semi-definite. A Gram matrix is
positive definite if and only if the elements v1 , . . . , vn V are linearly independent.
Proof: To prove positive (semi-)definiteness of K, we need to examine the associated quadratic form

    q(x) = xᵀ K x = Σ_{i,j=1}^n kij xi xj.

Substituting the values (3.43) for the matrix entries, we find

    q(x) = Σ_{i,j=1}^n ⟨vi ; vj⟩ xi xj.

Bilinearity of the inner product on V implies that we can assemble this summation into a single inner product

    q(x) = ⟨ Σ_{i=1}^n xi vi ; Σ_{j=1}^n xj vj ⟩ = ⟨v ; v⟩ = ‖v‖² ≥ 0,

where

    v = x1 v1 + ... + xn vn

lies in the subspace of V spanned by the given vectors. This immediately proves that K is positive semi-definite.
Moreover, q(x) = ‖v‖² > 0 as long as v ≠ 0. If v1, . . . , vn are linearly independent, then v = 0 if and only if x1 = ... = xn = 0, and hence, in this case, q(x) and K are positive definite.    Q.E.D.
Example 3.31. Consider the vectors

    v1 = ( 1, 2, -1 )ᵀ,    v2 = ( 3, 0, 6 )ᵀ,        (3.45)

in R³. Using the standard Euclidean dot product, the Gram matrix is

    K = [ v1 · v1   v1 · v2 ]  =  [  6  -3 ]
        [ v2 · v1   v2 · v2 ]     [ -3  45 ].

Positive definiteness implies that the associated quadratic form

    q(x1, x2) = 6 x1² - 6 x1 x2 + 45 x2² > 0

is positive for all (x1, x2) ≠ 0. This can be checked directly using the criterion in (3.42).

In the case of the Euclidean dot product on Rᵐ, the construction of the Gram matrix K can be directly implemented as follows. Given vectors v1, . . . , vn ∈ Rᵐ, let us form the m × n matrix A = ( v1 v2 . . . vn ) whose columns are the vectors in question. Owing to the identification (3.4) between the dot product and multiplication of row and column vectors, the (i, j) entry of K is given as the product

    kij = vi · vj = viᵀ vj
of the iᵗʰ row of the transpose Aᵀ with the jᵗʰ column of A. In other words, the Gram matrix

    K = Aᵀ A        (3.46)

is the matrix product of A with its transpose. For the preceding example (3.45),

    A = [  1  3 ]
        [  2  0 ],        and so        K = Aᵀ A = [  6  -3 ]
        [ -1  6 ]                                  [ -3  45 ].

Theorem 3.30 implies that the Gram matrix K = AT A is positive definite if and only
if the columns of A are linearly independent. This implies the following result.
Proposition 3.32. Given an m × n matrix A, the following are equivalent:
(i) The n × n Gram matrix K = Aᵀ A is positive definite.
(ii) A has linearly independent columns.
(iii) rank A = n ≤ m.
(iv) ker A = {0}.
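A quick numerical illustration of Proposition 3.32 for the vectors of Example 3.31 (a sketch of our own; numpy.linalg.cholesky succeeds exactly when the symmetric matrix passed to it is positive definite):

    import numpy as np

    A = np.array([[1.0, 3.0],
                  [2.0, 0.0],
                  [-1.0, 6.0]])          # columns are v1 and v2
    K = A.T @ A                          # Gram matrix (3.46)
    print(K)                             # [[ 6. -3.] [-3. 45.]]
    print(np.linalg.matrix_rank(A))      # 2 = n, so the columns are independent
    np.linalg.cholesky(K)                # raises LinAlgError if K were not positive definite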

As we noted above, positive (semi-)definite Gram matrices can be constructed, not just based on the dot product between vectors in Euclidean space, but with more general inner products on more general vector spaces. Let us next consider the case of an alternative inner product on the finite-dimensional vector space Rᵐ. As noted in Theorem 3.23, a general inner product on Rᵐ has the form

    ⟨v ; w⟩ = vᵀ C w    for    v, w ∈ Rᵐ,        (3.47)

where C > 0 is a symmetric, positive definite m × m matrix. Therefore, given n vectors v1, . . . , vn ∈ Rᵐ, the entries of the corresponding Gram matrix are the products

    kij = ⟨vi ; vj⟩ = viᵀ C vj.

If we assemble the column vectors as above into an m × n matrix A = ( v1 v2 . . . vn ), then the Gram inner products are given by multiplying the iᵗʰ row of Aᵀ by the jᵗʰ column of the product matrix C A. Therefore, the Gram matrix based on the alternative inner product (3.47) is given by

    K = Aᵀ C A.        (3.48)

Theorem 3.30 immediately implies that K is positive definite provided A has rank n.
Theorem 3.33. Suppose A is an m × n matrix with linearly independent columns. Suppose C > 0 is any positive definite m × m matrix. Then the matrix K = Aᵀ C A is a positive definite n × n matrix.

We use a different letter here to distinguish the inner product matrix C from the final Gram
matrix K.

The Gram matrix K constructed in (3.48) arises in a wide range of applications, including weighted least squares approximation theory, cf. Chapter 4, and the study of equilibrium of mechanical and electrical systems, cf. Chapter 6. Starting in Chapter 10, we shall look at infinite-dimensional generalizations that apply to differential equations and boundary value problems.
Example 3.34. In the majority of applications, C = diag(c1, . . . , cm) is a diagonal positive definite matrix, which requires it to have strictly positive diagonal entries ci > 0. This choice corresponds to a weighted inner product (3.7) on Rᵐ. For example, if we set

    C = [ 3  0  0 ]
        [ 0  2  0 ]
        [ 0  0  5 ],

then the weighted Gram matrix based on the vectors v1 = ( 1, 2, -1 )ᵀ, v2 = ( 3, 0, 6 )ᵀ of Example 3.31 is

    K = Aᵀ C A = [  16  -21 ]
                 [ -21  207 ].

Positive definiteness is a consequence of the linear independence of the vectors v1, v2, or, alternatively, can be checked directly from (3.42).
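The weighted Gram construction (3.48) is a one-line computation in NumPy; the following sketch (our addition) reproduces the matrix of Example 3.34:

    import numpy as np

    A = np.array([[1.0, 3.0], [2.0, 0.0], [-1.0, 6.0]])   # columns v1, v2
    C = np.diag([3.0, 2.0, 5.0])                           # inner product weights
    K = A.T @ C @ A                                        # formula (3.48)
    print(K)                                               # [[ 16. -21.] [-21. 207.]]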
The Gram construction also carries over to inner products on function space. Here is
a particularly important example.
Example 3.35. Consider the vector space C⁰[0,1] consisting of continuous functions on the interval 0 ≤ x ≤ 1, equipped with the L² inner product

    ⟨f ; g⟩ = ∫₀¹ f(x) g(x) dx.

Let us construct the Gram matrix corresponding to the elementary monomial functions 1, x, x². We compute the required inner products

    ⟨1 ; 1⟩ = ‖1‖² = ∫₀¹ dx = 1,            ⟨1 ; x⟩ = ∫₀¹ x dx = 1/2,         ⟨1 ; x²⟩ = ∫₀¹ x² dx = 1/3,
    ⟨x ; x⟩ = ‖x‖² = ∫₀¹ x² dx = 1/3,       ⟨x ; x²⟩ = ∫₀¹ x³ dx = 1/4,       ⟨x² ; x²⟩ = ‖x²‖² = ∫₀¹ x⁴ dx = 1/5.

Therefore, the Gram matrix is

    K = [  1   1/2  1/3 ]
        [ 1/2  1/3  1/4 ]
        [ 1/3  1/4  1/5 ].

The monomial functions 1, x, x² are linearly independent. Therefore, Theorem 3.30 implies that this particular matrix is positive definite.
The alert reader may recognize this Gram matrix K = H₃ as the 3 × 3 Hilbert matrix that we encountered in (1.72). More generally, the Gram matrix corresponding to the monomials 1, x, x², . . . , xⁿ has entries

    kij = ⟨xⁱ ; xʲ⟩ = ∫₀¹ x^{i+j} dx = 1/(i + j + 1),    i, j = 0, . . . , n.

Therefore, the monomial Gram matrix K = H_{n+1} is the (n+1) × (n+1) Hilbert matrix (1.72). As a consequence of Theorems 3.26 and 3.33, we have proved the following nontrivial result, as promised earlier.
Theorem 3.36. The n × n Hilbert matrix Hn is positive definite. In particular, Hn is a nonsingular matrix.
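A small numerical sketch (our own, not from the text) that builds Hilbert matrices from the formula 1/(i+j+1) and confirms positive definiteness by attempting a Cholesky factorization, which exists exactly when the matrix is positive definite:

    import numpy as np

    def hilbert(n):
        i, j = np.indices((n, n))
        return 1.0 / (i + j + 1.0)       # entries 1/(i+j+1), i, j = 0, ..., n-1

    for n in (3, 5, 8):
        np.linalg.cholesky(hilbert(n))   # raises LinAlgError if not positive definite
        print(n, "positive definite")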
Example 3.37. Let us construct the Gram matrix corresponding to the functions 1, cos x, sin x with respect to the inner product

    ⟨f ; g⟩ = ∫_{-π}^{π} f(x) g(x) dx

on the interval [-π, π]. We compute the inner products

    ⟨1 ; 1⟩ = ‖1‖² = ∫_{-π}^{π} dx = 2π,                        ⟨1 ; cos x⟩ = ∫_{-π}^{π} cos x dx = 0,
    ⟨cos x ; cos x⟩ = ‖cos x‖² = ∫_{-π}^{π} cos² x dx = π,      ⟨1 ; sin x⟩ = ∫_{-π}^{π} sin x dx = 0,
    ⟨sin x ; sin x⟩ = ‖sin x‖² = ∫_{-π}^{π} sin² x dx = π,      ⟨cos x ; sin x⟩ = ∫_{-π}^{π} cos x sin x dx = 0.

Therefore, the Gram matrix is the simple diagonal matrix K = diag( 2π, π, π ). The positive definiteness of K is immediately evident.

3.5. Completing the Square.


Gram matrices furnish us with a plentiful supply of positive definite matrices. However, we still do not know how to test whether a given symmetric matrix is positive definite.
As we shall soon see, the secret already appears in the particular computations in Examples
3.3 and 3.24.
The student may recall the importance of the method known as completing the square, first in the derivation of the quadratic formula for the solution to

    q(x) = a x² + 2 b x + c = 0,        (3.49)

and, later, in the integration of various types of rational functions. The key idea is to combine the first two terms in (3.49) in a square, and so rewrite the quadratic function in the form

    q(x) = a ( x + b/a )² + (a c - b²)/a = 0.        (3.50)

As a consequence,

    ( x + b/a )² = (b² - a c)/a².

The quadratic formula

    x = - b/a ± √(b² - a c)/a

follows by taking the square root of both sides and then solving for x. The intermediate step (3.50), where we eliminate the linear term, is known as completing the square.
We can perform the same manipulation on the corresponding homogeneous quadratic form

    q(x1, x2) = a x1² + 2 b x1 x2 + c x2².        (3.51)

We write

    q(x1, x2) = a x1² + 2 b x1 x2 + c x2² = a ( x1 + (b/a) x2 )² + ((a c - b²)/a) x2² = a y1² + ((a c - b²)/a) y2²        (3.52)

as a sum of squares of the new variables

    y1 = x1 + (b/a) x2,    y2 = x2.        (3.53)

Since y1 = y2 = 0 if and only if x1 = x2 = 0, the final expression is positive definite if and only if both coefficients are positive:

    a > 0,    (a c - b²)/a > 0.

This fact proves that conditions (3.42) are necessary and sufficient for the quadratic form (3.41) to be positive definite.
How this simple idea can be generalized to the multivariable case will become clear if we write the quadratic form identity (3.52) in matrix form. The original quadratic form (3.51) is

    q(x) = xᵀ K x,    where    K = [ a  b ]    and    x = ( x1, x2 )ᵀ.        (3.54)
                                   [ b  c ]

The second quadratic form in (3.52) is

    q̂(y) = yᵀ D y,    where    D = [ a        0       ]    and    y = ( y1, y2 )ᵀ.        (3.55)
                                   [ 0   (a c - b²)/a ]

Anticipating the final result, the equation connecting x and y can be written in matrix form as

    y = Lᵀ x,    or    ( y1, y2 )ᵀ = ( x1 + (b/a) x2,  x2 )ᵀ,    where    L = [  1    0 ].
                                                                               [ b/a   1 ]

Substituting into (3.55), we find

    yᵀ D y = (Lᵀ x)ᵀ D (Lᵀ x) = xᵀ L D Lᵀ x = xᵀ K x,    where    K = L D Lᵀ        (3.56)

is precisely the L D Lᵀ factorization of K = [ a b ; b c ] that appears in (1.62). We are thus led to the important conclusion that completing the square is the same as the L D Lᵀ factorization of a symmetric matrix, obtained through Gaussian elimination!
Equation (3.56) is valid for all regular n × n symmetric matrices, and shows how to write the associated quadratic form as a sum of squares:

    q̂(y) = yᵀ D y = d1 y1² + ... + dn yn².        (3.57)

The coefficients di are the pivots of K. In particular, Exercise  proves that q̂(y) is positive definite if and only if all the pivots are positive: di > 0. Thus, when K is a regular, symmetric matrix, the identity (3.56) proves Theorem 3.38.
Let us state the main result of the analysis that completely characterizes positive
definite matrices. Recall the definition of a regular matrix as one that can be reduced to
upper triangular form without any row interchanges. Theorem 1.31 implies that these are
the matrices admitting an L D LT factorization.
Theorem 3.38. A symmetric matrix K is positive definite if and only if it is regular
and has all positive pivots. Consequently, K is positive definite if and only if it can be
factored K = L D LT , where L is special lower triangular, and D is diagonal with all
positive diagonal entries.
The second statement follows directly from the first and Theorem 1.31. The complete
proof of this result will appear in the following section.
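The pivot test of Theorem 3.38 is easy to program. The following Python sketch (our addition; it assumes the input is symmetric and uses no row interchanges, as required for regularity) eliminates column by column and checks that every pivot is positive; it is applied to the matrix of Example 3.25 and to the second matrix of Example 3.24:

    import numpy as np

    def is_positive_definite(K):
        K = np.array(K, dtype=float)       # work on a copy
        n = K.shape[0]
        for i in range(n):
            pivot = K[i, i]
            if pivot <= 0.0:               # zero pivot: irregular; negative: not positive
                return False
            # eliminate the entries below the pivot in column i
            K[i + 1:] -= np.outer(K[i + 1:, i] / pivot, K[i])
        return True

    print(is_positive_definite([[1, 2, -1], [2, 6, 0], [-1, 0, 9]]))   # True
    print(is_positive_definite([[1, 2], [2, 1]]))                      # False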
Example 3.39. Consider the symmetric matrix

    K = [  1  2  -1 ]
        [  2  6   0 ]
        [ -1  0   9 ].

Gaussian elimination produces the factors

    L = [  1  0  0 ]        D = [ 1  0  0 ]        Lᵀ = [ 1  2  -1 ]
        [  2  1  0 ],           [ 0  2  0 ],            [ 0  1   1 ]
        [ -1  1  1 ]            [ 0  0  6 ]             [ 0  0   1 ]

in the factorization K = L D Lᵀ. Since the pivots, i.e., the diagonal entries 1, 2, 6 of D, are all positive, Theorem 3.38 implies that K is positive definite, which means that the associated quadratic form is positive definite:

    q(x) = x1² + 4 x1 x2 - 2 x1 x3 + 6 x2² + 9 x3² > 0    for all    x = ( x1, x2, x3 )ᵀ ≠ 0.

Indeed, the factorization implies that the associated quadratic form can be written as a sum of squares:

    q(x) = x1² + 4 x1 x2 - 2 x1 x3 + 6 x2² + 9 x3² = y1² + 2 y2² + 6 y3²,        (3.58)

where y = Lᵀ x, so

    y1 = x1 + 2 x2 - x3,    y2 = x2 + x3,    y3 = x3.

The fact that the coefficients of the yi² (which are the pivots) are all positive proves that q(x) is positive definite.
On the other hand, the matrix

    K = [ 1  2  3 ]
        [ 2  6  2 ]
        [ 3  2  8 ]

has L D Lᵀ factorization

    [ 1  2  3 ]   [ 1   0  0 ] [ 1  0   0 ] [ 1  2   3 ]
    [ 2  6  2 ] = [ 2   1  0 ] [ 0  2   0 ] [ 0  1  -2 ].
    [ 3  2  8 ]   [ 3  -2  1 ] [ 0  0  -9 ] [ 0  0   1 ]

The fact that D has a negative diagonal entry, -9, implies that K is not positive definite, even though all its entries are positive! The associated quadratic form is

    q(x) = x1² + 4 x1 x2 + 6 x1 x3 + 6 x2² + 4 x2 x3 + 8 x3²,

and, for instance, q(5, -2, -1) = -5 < 0, which immediately demonstrates that K is not positive definite.
The only remaining issue is to show that an irregular matrix cannot be positive definite. For example, the quadratic form corresponding to the irregular matrix

    K = [ 0  1 ]
        [ 1  0 ]

is q(x) = 2 x1 x2, which is clearly not positive definite. In general, if k11 = 0, then it cannot serve as the first pivot, and so K is not regular. But then q(e1) = e1ᵀ K e1 = 0, and so K is not positive definite. (It might be positive semi-definite, or just indefinite.)
Otherwise, if k11 ≠ 0, then we use Gaussian elimination to make all entries lying in the first column below the pivot equal to zero. As remarked above, this is equivalent to completing the square in the initial terms of the associated quadratic form

    q(x) = k11 x1² + 2 k12 x1 x2 + ... + 2 k1n x1 xn + k22 x2² + ... + knn xn²
         = k11 ( x1 + (k12/k11) x2 + ... + (k1n/k11) xn )² + q̃(x2, . . . , xn)
         = k11 ( x1 + l21 x2 + ... + ln1 xn )² + q̃(x2, . . . , xn),        (3.59)

where

    l21 = k21/k11 = k12/k11,    . . . ,    ln1 = kn1/k11 = k1n/k11,

are precisely the multiples appearing in the first column of the lower triangular matrix L obtained from Gaussian Elimination applied to K, while

    q̃(x2, . . . , xn) = Σ_{i,j=2}^n k̃ij xi xj

is a quadratic form involving one fewer variable. The entries of its symmetric coefficient matrix K̃ are

    k̃ij = k̃ji = kij - lj1 k1i,    for    i ≤ j.

Thus, the entries of K̃ that lie on or below the diagonal are exactly the same as the entries appearing on or below the diagonal of K after the first phase of the elimination process. In particular, the second pivot of K is the entry k̃22 that appears in the corresponding slot in K̃.

If q̃ is not positive definite, then q cannot be positive definite. Indeed, suppose that there exist x2⋆, . . . , xn⋆ such that

    q̃(x2⋆, . . . , xn⋆) ≤ 0.

Setting

    x1⋆ = - l21 x2⋆ - ... - ln1 xn⋆

makes the initial square term in (3.59) equal to 0, and so

    q(x1⋆, x2⋆, . . . , xn⋆) = q̃(x2⋆, . . . , xn⋆) ≤ 0.

In particular, if the second diagonal entry k̃22 = 0, then q̃ is not positive definite, and so neither is q. Continuing this process, if any diagonal entry of the reduced matrix vanishes, then the reduced quadratic form cannot be positive definite, and so neither can q. This demonstrates that if K is irregular, then it cannot be positive definite, which completes the proof of Theorem 3.38.
Remark: Exercise  describes an alternative criterion for positive definiteness that is based on the positivity of square subdeterminants of the matrix. However, for matrices of any size larger than 3 × 3, checking the determinantal criteria is quite inefficient, and we shall exclusively rely on Gaussian elimination and checking positivity of the pivots.
The Cholesky Factorization
The identity (3.56) shows us how to write any regular quadratic form q(x) as a sum of squares. One can push this result slightly further in the positive definite case. Since each pivot di > 0, we can write the diagonal form (3.57) as a sum of squares with unit coefficients:

    q̂(y) = d1 y1² + ... + dn yn² = ( √d1 y1 )² + ... + ( √dn yn )² = z1² + ... + zn²,

where zi = √di yi. In matrix form, we are writing

    q̂(y) = yᵀ D y = zᵀ z = ‖z‖²,    where    z = C y,    with    C = diag( √d1, . . . , √dn ).

Since D = C², the matrix C can be thought of as a square root of the diagonal matrix D. Substituting back into (1.58), we deduce the Cholesky factorization

    K = L D Lᵀ = L C Cᵀ Lᵀ = M Mᵀ,    where    M = L C,        (3.60)

of a positive definite matrix. Note that M is a lower triangular matrix with all positive diagonal entries, namely the square roots of the pivots, mii = ci = √di. Applying the Cholesky factorization to the corresponding quadratic form produces

    q(x) = xᵀ K x = xᵀ M Mᵀ x = zᵀ z = ‖z‖²,    where    z = Mᵀ x.        (3.61)

Example 3.40. For the matrix

    K = [  1  2  -1 ]
        [  2  6   0 ]
        [ -1  0   9 ]

considered in Example 3.39, the Cholesky formula (3.60) gives K = M Mᵀ, where

    M = L C = [  1  0  0 ] [ 1   0   0 ]   =   [  1   0   0 ]
              [  2  1  0 ] [ 0  √2   0 ]       [  2  √2   0 ].
              [ -1  1  1 ] [ 0   0  √6 ]       [ -1  √2  √6 ]

The associated quadratic function can then be written as a sum of pure squares:

    q(x) = x1² + 4 x1 x2 - 2 x1 x3 + 6 x2² + 9 x3² = z1² + z2² + z3²,

where z = Mᵀ x, so

    z1 = x1 + 2 x2 - x3,    z2 = √2 x2 + √2 x3,    z3 = √6 x3.
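NumPy's built-in Cholesky routine uses the same lower-triangular convention K = M Mᵀ, so Example 3.40 can be checked in a few lines (a sketch of our own, not part of the text):

    import numpy as np

    K = np.array([[1.0, 2.0, -1.0], [2.0, 6.0, 0.0], [-1.0, 0.0, 9.0]])
    M = np.linalg.cholesky(K)          # lower triangular factor
    print(M)                           # rows: [1,0,0], [2, 1.414, 0], [-1, 1.414, 2.449]
    print(np.allclose(M @ M.T, K))     # True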

3.6. Complex Vector Spaces.


Although physical applications ultimately require real answers, complex numbers and
complex vector spaces assume an extremely useful, if not essential, role in the intervening analysis. Particularly in the description of periodic phenomena, the practical utility
of complex numbers and complex exponentials becomes evident, simplifying complicated
trigonometric formulae. In quantum mechanics, complex numbers are ubiquitous. The
basic physical quantities are complex wave functions. Moreover, the Schrödinger equation,
which is the basic equation governing quantum systems, is a complex partial differential
equation with complex-valued solutions.
In this section, we survey the basic facts about complex numbers and complex vector
spaces. Most of the vector space constructions are entirely analogous to the real case, and
will not be dwelled on. The one exception is the complex version of an inner product,
which does introduce some novelties not found in its simpler real counterpart. Complex
analysis (integration and differentiation of complex functions) and its applications to fluid
flows, potential theory, waves and other areas of mathematics, physics and engineering,
will be the subject of Chapter 15.
Complex Numbers
Recall that a complex number is an expression of the form z = x + i y, where x, y are real and i = √-1. We call x = Re z the real part of z and y = Im z the imaginary part. A real number x is merely a complex number with zero imaginary part: Im z = 0.

[Figure 3.6: Complex Numbers. The point z = x + i y in the complex plane, at distance r = | z | from the origin.]

Complex addition and multiplication are based on simple adaptations of the rules of real arithmetic to include the equation i² = -1, and so

    (x + i y) + (u + i v) = (x + u) + i (y + v),
    (x + i y) (u + i v) = (x u - y v) + i (x v + y u).        (3.62)

Complex numbers enjoy all the usual laws of real addition and multiplication, including
commutativity: z w = w z.
Complex numbers x + i y can be identified with vectors ( x, y )ᵀ ∈ R² in the real plane.
Complex addition (3.62) corresponds to vector addition. (However, complex multiplication
does not have a readily identifiable vector counterpart.)
Another important operation on complex numbers is that of complex conjugation.
Definition 3.41. The complex conjugate of z = x + i y is z̄ = x - i y, and so

    Re z̄ = Re z,    Im z̄ = - Im z.

Geometrically, the operation of complex conjugation coincides with reflection of the corresponding vector through the real axis; see Figure 3.6. In particular, z̄ = z if and only if z is real. Note that

    Re z = (z + z̄)/2,    Im z = (z - z̄)/(2 i).        (3.63)

Complex conjugation is compatible with complex arithmetic: the conjugate of a sum is the sum of the conjugates, z̄ + w̄, and the conjugate of a product is the product of the conjugates, z̄ w̄.
In particular, the product of a complex number and its conjugate

    z z̄ = (x + i y)(x - i y) = x² + y²        (3.64)

is real and non-negative. Its square root is known as the modulus of the complex number z = x + i y, and written

    | z | = √(x² + y²).        (3.65)

Note that | z | ≥ 0, with | z | = 0 if and only if z = 0. The modulus | z | generalizes the absolute value of a real number, and coincides with the standard Euclidean norm in the (x, y) plane. This implies the validity of the triangle inequality

    | z + w | ≤ | z | + | w |.        (3.66)

Using the modulus, equation (3.64) can be rewritten as

    z z̄ = | z |².        (3.67)

Rearranging the factors, we deduce the basic reciprocal formula

    1/z = z̄ / | z |²,    or, equivalently,    1/(x + i y) = (x - i y)/(x² + y²),        (3.68)

which is well-defined except when z = 0. The general formula for complex division

    w/z = w z̄ / | z |²,    or, equivalently,    (u + i v)/(x + i y) = [ (x u + y v) + i (x v - y u) ] / (x² + y²),        (3.69)

is an immediate consequence.
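Python's built-in complex type makes it easy to spot-check the reciprocal and division formulas (a short sketch of our own; the particular numbers are arbitrary):

    z = 3.0 + 4.0j
    w = 1.0 - 2.0j
    print(1 / z, z.conjugate() / abs(z) ** 2)        # both equal (0.12 - 0.16j), per (3.68)
    print(w / z, w * z.conjugate() / abs(z) ** 2)    # identical values, per (3.69)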
The modulus of a complex number,

    r = | z | = √(x² + y²),

is one component of its polar coordinate representation

    x = r cos θ,    y = r sin θ,    or    z = r (cos θ + i sin θ),        (3.70)

illustrated in Figure 3.6. The polar angle, which measures the angle that the line connecting z to the origin makes with the horizontal axis, is known as the phase, and written

    ph z = θ.        (3.71)

As such, the phase is only defined up to an integer multiple of 2π. We note that the modulus and phase of a product of complex numbers can be readily computed:

    | z w | = | z | | w |,    ph (z w) = ph z + ph w.        (3.72)

On the other hand, complex conjugation preserves the modulus, but negates the phase:

    | z̄ | = | z |,    ph z̄ = - ph z.        (3.73)

Another common term for the angle is argument, written arg z. For family reasons, [ 96 ],
and to avoid confusion with the argument of a function, we have chosen to use phase throughout
this text.

[Figure 3.7: Real and Imaginary Parts of e^z.]

One of the most important formulas in all of mathematics is Euler's formula

    e^{i θ} = cos θ + i sin θ,        (3.74)

relating the complex exponential with the real sine and cosine functions. This basic identity has many justifications; see Exercise  for one based on power series. Euler's formula (3.74) can be used to compactly rewrite the polar form (3.70) of a complex number as

    z = r e^{i θ},    where    r = | z |,    θ = ph z.        (3.75)

The complex conjugate identity

    e^{- i θ} = cos(- θ) + i sin(- θ) = cos θ - i sin θ

permits us to express the basic trigonometric functions in terms of complex exponentials:

    cos θ = (e^{i θ} + e^{- i θ})/2,    sin θ = (e^{i θ} - e^{- i θ})/(2 i).        (3.76)

These formulae can be used to simplify trigonometric identities and integrals.


The exponential of a general complex number is easily derived from the basic Euler formula and the standard properties of the exponential function, which carry over unaltered to the complex domain:

    e^z = e^{x + i y} = e^x e^{i y} = e^x cos y + i e^x sin y.        (3.77)

Graphs of the real and imaginary parts of the complex exponential appear in Figure 3.7. Note that e^{2 π i} = 1, and hence the exponential function is periodic,

    e^{z + 2 π i} = e^z,        (3.78)

with imaginary period 2 π i, a reflection of the periodicity of the trigonometric functions in Euler's formula.
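Euler's formula (3.74) and the periodicity (3.78) are easy to verify numerically with the standard cmath module (a sketch of our own; the sample values are arbitrary):

    import cmath, math

    theta = 0.7
    print(cmath.exp(1j * theta))                 # (0.7648...+0.6442...j)
    print(math.cos(theta), math.sin(theta))      # same real and imaginary parts
    z = 1.3 - 0.4j
    print(abs(cmath.exp(z + 2j * math.pi) - cmath.exp(z)))   # ~0, periodicity (3.78)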
Complex Vector Spaces and Inner Products


A complex vector space is defined exactly as its real cousin, the only difference being that we replace real scalars R by complex scalars C in Definition 2.1. The most basic example is the n-dimensional complex vector space Cⁿ consisting of all column vectors z = ( z1, z2, . . . , zn )ᵀ that have n complex entries: z1, . . . , zn ∈ C. Verification of each of the vector space axioms is a straightforward exercise.
We can write any complex vector z = x + i y ∈ Cⁿ as a linear combination of two real vectors x, y ∈ Rⁿ. Its complex conjugate z̄ = x - i y is obtained by taking the complex conjugates of its individual entries: the conjugate of zj = xj + i yj is z̄j = xj - i yj. Thus, for example, if

    z = ( 1 + 2 i, -3, 5 i )ᵀ,    then    z̄ = ( 1 - 2 i, -3, -5 i )ᵀ.

In particular, z ∈ Rⁿ is a real vector if and only if z = z̄.


Most of the vector space concepts we developed in the real domain can be straightforwardly extended to the complex regime. The one exception is the concept of an inner product, which requires a little thought. In analysis, the most important properties of the inner product and norm are the associated inequalities, Cauchy-Schwarz and triangle. Since there is no natural ordering of the complex numbers, one cannot make any sense of a complex inequality like z < w. Thus, to formulate inequalities, the norm of a complex vector should still be a positive, real number. Summing the squares of the entries of a complex vector will not define a norm on Cⁿ, since the result will typically be a complex scalar. Moreover, some nonzero complex vectors, e.g., ( 1, i )ᵀ, would have zero norm, violating positivity.
The correct definition is modeled on the definition of the modulus

    | z | = √(z z̄)

of a complex scalar z ∈ C. If, in analogy with the real definition (3.1), the quantity inside the square root is to represent the inner product of z with itself, then we should define the dot product between two complex numbers to be

    z · w = z w̄,    so that    z · z = z z̄ = | z |².

If z = x + i y and w = u + i v, then

    z · w = z w̄ = (x + i y)(u - i v) = (x u + y v) + i (y u - x v).        (3.79)

Thus, the dot product of two complex numbers is, in general, complex. The real part of z · w is, in fact, the Euclidean dot product between the corresponding vectors in R², while the imaginary part is, interestingly, their scalar cross-product, cf. (cross2).

On the other hand, in relativity, the Minkowski norm is also not always positive, and
indeed the vectors with zero norm play a critical role as they lie on the light cone emanating from
the origin, [ 88 ].

The vector version of this construction is named after the nineteenth century French mathematician Charles Hermite, and called the Hermitian dot product on Cⁿ. It has the explicit formula

    z · w = zᵀ w̄ = z1 w̄1 + z2 w̄2 + ... + zn w̄n,    for    z = ( z1, . . . , zn )ᵀ,    w = ( w1, . . . , wn )ᵀ.        (3.80)

Pay attention to the fact that we must apply complex conjugation to all the entries of the second vector. For example, if

    z = ( 1 + i, 3 + 2 i )ᵀ,    w = ( 1 + 2 i, i )ᵀ,

then

    z · w = (1 + i)(1 - 2 i) + (3 + 2 i)(- i) = 5 - 4 i.

On the other hand,

    w · z = (1 + 2 i)(1 - i) + i (3 - 2 i) = 5 + 4 i.

Therefore, the Hermitian dot product is not symmetric. Reversing the order of the vectors results in complex conjugation of the dot product: w · z is the complex conjugate of z · w.
But this extra complication does have the effect that the induced norm, namely

    ‖z‖ = √(z · z) = √(zᵀ z̄) = √( | z1 |² + ... + | zn |² ),        (3.81)

is strictly positive for all 0 ≠ z ∈ Cⁿ. For example, if

    z = ( 1 + 3 i, -2 i, -5 )ᵀ,    then    ‖z‖ = √( | 1 + 3 i |² + | -2 i |² + | -5 |² ) = √39.

The Hermitian dot product is well behaved under complex vector addition:

    (z + ẑ) · w = z · w + ẑ · w,        z · (w + ŵ) = z · w + z · ŵ.

However, while complex scalar multiples can be extracted from the first vector without any problem, when they multiply the second vector, they emerge in complex conjugate form:

    (c z) · w = c (z · w),        z · (c w) = c̄ (z · w),        c ∈ C.

Thus, the Hermitian dot product is not bilinear in the strict sense, but satisfies something that, for lack of a better name, is known as sesqui-linearity.
The general definition of an inner product on a complex vector space is based on the preceding properties of the Hermitian dot product.
Definition 3.42. An inner product on the complex vector space V is a pairing that takes two vectors v, w ∈ V and produces a complex number ⟨v ; w⟩ ∈ C, subject to the following requirements for all u, v, w ∈ V, and c, d ∈ C.
(i) Sesqui-linearity:

    ⟨c u + d v ; w⟩ = c ⟨u ; w⟩ + d ⟨v ; w⟩,
    ⟨u ; c v + d w⟩ = c̄ ⟨u ; v⟩ + d̄ ⟨u ; w⟩.        (3.82)

(ii) Conjugate Symmetry:

    ⟨v ; w⟩ is the complex conjugate of ⟨w ; v⟩.        (3.83)

(iii) Positivity:

    ‖v‖² = ⟨v ; v⟩ ≥ 0,    and    ⟨v ; v⟩ = 0    if and only if    v = 0.        (3.84)

Thus, in a complex inner product space, one must pay careful attention to the complex conjugate that appears when the second argument is multiplied by a complex scalar, as well as the complex conjugate when switching the order of the two arguments in the inner product.
Theorem 3.43. The Cauchy-Schwarz inequality

    | ⟨v ; w⟩ | ≤ ‖v‖ ‖w‖,    v, w ∈ V,

with | · | now denoting the complex modulus, and the triangle inequality

    ‖v + w‖ ≤ ‖v‖ + ‖w‖

hold for any complex inner product space.
The proof of this result is almost the same as the real case, and is left to the reader.
Example 3.44. The vectors v = ( 1 + i, 2 i, -3 )ᵀ and w = ( 2 - i, 1, 2 + 2 i )ᵀ have

    ‖v‖ = √(2 + 4 + 9) = √15,        ‖w‖ = √(5 + 1 + 8) = √14,
    v · w = (1 + i)(2 + i) + 2 i · 1 + (-3)(2 - 2 i) = -5 + 11 i.

Thus, the Cauchy-Schwarz inequality says

    | ⟨v ; w⟩ | = | -5 + 11 i | = √(25 + 121) = √146 ≤ √210 = √15 √14 = ‖v‖ ‖w‖.

Similarly, the triangle inequality tells us that

    ‖v + w‖ = ‖ ( 3, 1 + 2 i, -1 + 2 i )ᵀ ‖ = √(9 + 5 + 5) = √19 ≤ √15 + √14 = ‖v‖ + ‖w‖,

which is also valid.
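The computations of Example 3.44 can be repeated with NumPy (a sketch of our own; note that the book's convention conjugates the second vector, so we compute the dot product explicitly rather than relying on a library's convention):

    import numpy as np

    v = np.array([1 + 1j, 2j, -3])
    w = np.array([2 - 1j, 1, 2 + 2j])
    dot = np.sum(v * np.conj(w))                       # book's Hermitian dot product
    print(dot)                                         # (-5+11j)
    print(abs(dot) <= np.linalg.norm(v) * np.linalg.norm(w))               # True, Cauchy-Schwarz
    print(np.linalg.norm(v + w) <= np.linalg.norm(v) + np.linalg.norm(w))  # True, triangle inequality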


Example 3.45. Let C⁰ = C⁰[-π, π] denote the complex vector space consisting of all complex valued continuous functions f(x) = u(x) + i v(x) depending upon a real variable -π ≤ x ≤ π. The Hermitian L² inner product is defined as

    ⟨f ; g⟩ = ∫_{-π}^{π} f(x) ḡ(x) dx,        (3.85)

so that the corresponding norm is

    ‖f‖ = √( ∫_{-π}^{π} | f(x) |² dx ) = √( ∫_{-π}^{π} [ u(x)² + v(x)² ] dx ).        (3.86)

The reader should check that (3.85) satisfies the basic Hermitian inner product axioms.
For example, if k, l are integers, then the inner product of the pair of complex exponential functions e^{i k x} and e^{i l x} is

    ⟨e^{i k x} ; e^{i l x}⟩ = ∫_{-π}^{π} e^{i k x} e^{- i l x} dx = ∫_{-π}^{π} e^{i (k - l) x} dx
        = 2π  when  k = l,    and    = [ e^{i (k - l) x} / ( i (k - l) ) ]_{x = -π}^{π} = 0  when  k ≠ l.

To evaluate the integral, we used Exercise  and the fact that

    e^{± i k π} = ( e^{± i π} )^k = (-1)^k.        (3.87)

We conclude that when k ≠ l, the complex exponentials e^{i k x} and e^{i l x} are orthogonal, since their inner product is zero. This example will be of fundamental significance in the complex version of Fourier analysis.
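A numerical sanity check of this orthogonality relation (our own sketch, approximating the integral (3.85) by the trapezoidal rule on a fine grid):

    import numpy as np

    x = np.linspace(-np.pi, np.pi, 20001)

    def herm_inner(k, l):
        # <e^{ikx}; e^{ilx}> per (3.85), trapezoidal approximation
        vals = np.exp(1j * k * x) * np.conj(np.exp(1j * l * x))
        return np.sum((vals[:-1] + vals[1:]) / 2.0 * np.diff(x))

    print(abs(herm_inner(2, 5)))     # ~0: orthogonal when k != l
    print(herm_inner(3, 3).real)     # ~2*pi = 6.2831...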

Chapter 4
Minimization and Least Squares Approximation
Because Nature likes to be efficient, many applied problems are founded on some
form of minimization principle. For example, in a mechanical system, the stable equilibrium positions are obtained by minimizing the potential energy. The geometrical problem
of minimizing distance, say from one point to another, also appears in many physical situations. For example, in optics and in relativity, light rays follow the paths of minimal
distance the geodesics on the curved spacetime. In data analysis, one option for
fitting a function to a set of sampled data points is to minimize the least squares error,
which measures the deviation between the sample data and the function. The least squares
paradigm applies to a wide range of applied mathematical systems. In particular, it underlies the theory of Fourier series, in itself of inestimable importance in mathematics, physics
and engineering. Solutions to many of the important differential equations of mathematical
physics and engineering are characterized by an underlying minimization principle. Many
numerical methods, including finite element method for boundary value problems, rely
on the minimization principle that characterizes the solution. Optimization is ubiquitous
in control theory, experimental design, linear programming, econometrics, and almost all
fields of analysis.
In this section, we study the most basic minimization problem that of a quadratic
function of several variables. The solution to this special minimization problem can be
effected by solving an associated linear system. We then discuss the applications to least
squares fitting of data and approximation of functions. Applications to equilibrium mechanics will form the focus of Chapter 6. Numerical applications will appear starting in
Chapter 10. Nonlinear minimization problems will be discussed in Section 18.3.

4.1. Minimization Problems.


Let us begin by introducing three important minimization problems: one physical, one analytical, and one geometrical.
Equilibrium Mechanics
A fundamental principle of mechanics is that systems in equilibrium minimize potential energy. For example, a ball will roll downhill until it reaches the bottom, where it
minimizes its potential energy due to gravity. In general, if x R 3 denotes the position of
a particle and U (x) the potential energy function, then the equilibria are found at the local
minima of U (x). In simple situations, the energy has a unique minimal value, and that is
the only possible equilibrium position for the particle. However, in nonlinear mechanics,
one has the possibility of several minima, and hence the system can remain in equilibrium
in several possible configurations. For instance, consider a ball rolling on a curved surface z = f(x, y) under a vertical gravitational force. The equilibria are at the local minima of f, the bottoms of the valleys. If the energy function does not have a minimum, then the system cannot assume an equilibrium configuration and is completely unstable; the ball has nowhere to rest. In general, there may be several possible local minima. However, the minimization problems of interest in this chapter, which involve nondegenerate quadratic forms, have at most one minimizer.
Similarly, a pendulum will swing back and forth unless it is at the bottom of its arc,
where potential energy is minimized. Actually, the pendulum has a second equilibrium
position at the top of the arc, but this equilibrium position is unstable meaning that any
tiny movement (perturbation) will knock it off balance. Therefore, a better way of stating
this principle is that stable equilibria are where the mechanical system minimizes potential
energy. For the ball on a curved surface, the local minima are stable, while the local maxima, the tops of the hills, are unstable equilibria. This basic idea is fundamental
to the understanding and analysis of the equilibrium configurations of a wide range of
physical systems, including masses and springs, structures, electrical circuits, and even
continuum models of solid mechanics and elasticity, fluid mechanics, electromagnetism,
thermodynamics, statistical mechanics, and so on.
Solution of Equations
Suppose we want to solve a system of real equations

    f1(x) = 0,    f2(x) = 0,    . . . ,    fm(x) = 0,        (4.1)

where x = (x1, . . . , xn) ∈ Rⁿ. This can be converted into a minimization problem in the following seemingly silly manner. Define

    p(x) = f1(x)² + ... + fm(x)².        (4.2)

Since each summand in p is non-negative, we clearly have p(x) ≥ 0 for all x. Moreover, p(x) = 0 if and only if each summand is zero, and hence x is a solution to (4.1). Therefore, the minimum value of p(x) is zero, and the minimum is achieved if and only if x solves the system (4.1).
We can extend this idea by setting f(x) = ( f1(x), . . . , fm(x) )ᵀ, so that f defines a function from Rⁿ to Rᵐ. Define

    p(x) = ‖f(x)‖²,        (4.3)

where ‖ · ‖ denotes any norm on Rᵐ; the previous formula (4.2) comes from using the standard Euclidean norm. In view of the positivity property of the norm, the minimum p(x⋆) = 0 is achieved if and only if f(x⋆) = 0 is a solution to (4.1).
The most important case is when we have a linear system

    A x = b        (4.4)

consisting of m equations in n unknowns. In this case, the solutions may be obtained by minimizing the function

    p(x) = ‖A x - b‖².        (4.5)
Of course, it is not clear that we have gained much, since we already know how to solve
A x = b by Gaussian elimination. However, this rather simple artifice has profound
consequences.
Suppose the system of equations (4.1) does not have a solution. For instance, in the
linear case (4.4), suppose b does not lie in the range of the matrix A which is very typical
if there are more equations than unknowns: m > n. Such problems arise in data fitting,
when the measured data points are all supposed to lie on a straight line, say, but rarely
do so exactly, due to experimental error. Although we know there is no exact solution to
the system, we might still want to find the best x that comes as close to solving the
system as one can. One way to measure closeness is by looking at the norm k A x b k of
the difference between the left and right hand sides of the system. The smaller the norm,
the better the attempted solution.
The least squares methodology minimizes the Euclidean norm, or, more generally,
a norm arising from an inner product on the underlying vector space. Thus, the least
squares solution to a linear system A x = b is the vector x that minimizes the designated
inner product norm of the difference k A x b k. In particular, if the system has a genuine
solution x? , then the norm achieves it minimum value of k A x b k = 0 if and only if
x = x? , and hence all true solutions qualify as least squares solutions. Least squares is
the method of choice for fitting functions to experimental data and performing statistical
analysis thereon.
The Closest Point
The following basic minimization problem arises in elementary geometry. Given a point b ∈ Rᵐ and a subset V ⊂ Rᵐ, find the closest point v⋆ ∈ V to b. Therefore, we seek to find the point that minimizes the distance d(b, v) = ‖b - v‖ over all possible v ∈ V. Here ‖ · ‖ denotes the Euclidean norm, or, more generally, a norm arising from a specified inner product on Rᵐ.
The simplest situation occurs when V is a subspace of R m , and it turns out that the
closest point problem can be reformulated as a least squares minimization problem. Let
v1 , . . . , vn be a basis for V . Our handy formula (2.15) expresses the general element of V
as a linear combination of the form
    v = x1 v1 + ... + xn vn = A x,

where A = ( v1 v2 . . . vn ) is the m × n matrix formed by the (column) basis vectors of the
subspace V . Note that we can identify V = rng A with the range of A, i.e., the subspace
spanned by its columns. Consequently, the closest point in V to the point b is found by
minimizing
k b v k2 = k b A x k 2 .
This turns out to be exactly the same as the least squares function (4.5). The solution
to the closest point problem is also the solution to the basic least squares minimization
problem!
Thus, we have deduced a fundamental principle connecting least squares solutions to
linear systems with the geometrical problem of minimizing distances to subspaces.
[Figure 4.1: Parabolas. The graphs of p(x) = a x² + 2 b x + c for a > 0, a < 0, and a = 0.]

Theorem 4.1. Assume that the matrix A has linearly independent columns. Let V = rng A denote the range of A. Then the least squares solution x⋆ to the system A x = b determines the closest point v⋆ = A x⋆ ∈ V to b.
All three of the preceding minimization problems are solved by the same underlying
mathematical construction, which will be described in detail in Section 4.4.

4.2. Minimization of Quadratic Functions.


The simplest algebraic equations are the linear systems; these must be thoroughly understood before venturing into the far more complicated nonlinear realm. For minimization problems, the starting point is the minimization of a quadratic function. (Linear functions do not have minima; think of the function f(x) = α x + β, whose graph is a straight line.) In this section, we shall see how the problem of minimizing a general quadratic function of n variables can be solved by linear algebra techniques.
Let us begin by reviewing the very simplest example, minimizing a scalar quadratic function

    p(x) = a x² + 2 b x + c.        (4.6)

If a > 0, then the graph of p is a parabola pointing upwards, as in Figure 4.1, and so there exists a unique minimum value. If a < 0, the parabola points downwards, and there is no minimum (although there is a maximum), while if a = 0, the graph is a straight line, and there is neither minimum nor maximum.
In the case a > 0, the minimum can be found by calculus. The critical points of a function, which are candidates for minima (and maxima), are found by setting its derivative to zero. In this case, differentiating and solving

    p′(x) = 2 a x + 2 b = 0,

we conclude that the only possible minimum value occurs at

    x⋆ = - b/a,    where    p(x⋆) = c - b²/a.        (4.7)

Of course, one must check that this critical point is indeed a minimum, and not a maximum or inflection point. The second derivative test will show that p″(x⋆) = 2 a > 0, and so x⋆ is at least a local minimum.
A more instructive approach to this problem, and one that only requires elementary algebra, is to complete the square. As was done in (3.50), we rewrite

    p(x) = a ( x + b/a )² + (a c - b²)/a.        (4.8)

If a > 0, then the first term is always ≥ 0, and moreover equals 0 only at x⋆ = - b/a, reproducing (4.7). The second term is constant, and so unaffected by the value of x. Therefore, p(x) is minimized when the non-negative squared term in (4.8) vanishes, x + b/a = 0. Thus, the simple algebraic identity (4.8) immediately proves that the global minimum of p is at x⋆, and, moreover, the minimal value p(x⋆) = (a c - b²)/a is the constant term.
Remark: For ease of exposition, we will only look for minima. The problem of maximizing a function p(x) is easily converted into a minimization problem for - p(x), and so can be immediately solved once we learn how to minimize.
Now that we have the scalar case firmly in hand, let us turn to the problem of minimizing quadratic functions that depend on several variables. Thus, we seek to minimize a general quadratic function

    p(x) = p(x1, . . . , xn) = Σ_{i,j=1}^n kij xi xj - 2 Σ_{i=1}^n fi xi + c        (4.9)

depending on n real variables x = ( x1, x2, . . . , xn ) ∈ Rⁿ. Note that p(x) is slightly more general than a quadratic form (3.38) since it also contains linear and constant terms. We can rewrite (4.9) in the more convenient matrix form

    p(x) = xᵀ K x - 2 xᵀ f + c.        (4.10)

Here K = (kij) is an n × n matrix, which we can take to be symmetric without loss of generality, cf. Exercise , f is a constant vector, and c is a constant scalar. We shall adapt our method of completing the square to analyze the minima of the general quadratic function (4.10).
We first note that in the simple scalar case (4.6), we needed to impose the condition that the quadratic coefficient a is positive in order to obtain a (unique) minimum. The corresponding condition for the multivariable case is that the quadratic coefficient matrix K be positive definite. With this assumption, we can prove a very general theorem that has wide applicability.
Theorem 4.2. If K > 0 is a positive definite matrix, then the quadratic function p(x) = xᵀ K x - 2 xᵀ f + c has a unique minimizer, which is the solution to the linear system

    K x = f,    namely    x⋆ = K⁻¹ f.        (4.11)

The minimum value of p(x) is equal to any of the following expressions:

    p(x⋆) = p(K⁻¹ f) = c - fᵀ K⁻¹ f = c - f · x⋆ = c - (x⋆)ᵀ K x⋆.        (4.12)

Proof: Suppose x⋆ = K⁻¹ f is the solution to (4.11). Then, for any x ∈ Rⁿ, we can write

    p(x) = xᵀ K x - 2 xᵀ f + c = xᵀ K x - 2 xᵀ K x⋆ + c
         = (x - x⋆)ᵀ K (x - x⋆) + [ c - (x⋆)ᵀ K x⋆ ],        (4.13)

where we used the symmetry of K = Kᵀ to identify xᵀ K x⋆ = (x⋆)ᵀ K x. Note that the second term in the last line does not depend on x. Moreover, since K is positive definite, the first term is always ≥ 0 and vanishes if and only if x - x⋆ = 0. Therefore, the minimum of p(x) occurs at x = x⋆. The minimum value of p(x) is equal to the second, constant term; simple substitutions then prove the alternative expressions in (4.12).    Q.E.D.
Example 4.3. Let us illustrate this result with a simple example. Consider the problem of minimizing the quadratic function

    p(x1, x2) = 4 x1² - 2 x1 x2 + 3 x2² + 3 x1 - 2 x2 + 1

over all (real) x1, x2. We first write p in the matrix form (4.10), so

    p(x1, x2) = ( x1  x2 ) [  4  -1 ] ( x1 )  -  2 ( x1  x2 ) ( -3/2 )  +  1,
                           [ -1   3 ] ( x2 )                  (   1  )

whereby

    K = [  4  -1 ],        f = ( -3/2 ).        (4.14)
        [ -1   3 ]             (   1  )

(Pay attention to the overall factor of -2 in the linear terms!) According to Example 3.24, K is positive definite, and hence p(x1, x2) has a minimum, which is found by solving the linear system (4.11), namely

    [  4  -1 ] ( x1 )  =  ( -3/2 ).        (4.15)
    [ -1   3 ] ( x2 )     (   1  )

Therefore, the minimum occurs at

    x⋆ = ( x1⋆, x2⋆ )ᵀ = ( -7/22, 5/22 )ᵀ ≈ ( -0.318182, 0.227273 )ᵀ,

with minimal value

    p(x⋆) = p( -7/22, 5/22 ) = 13/44 ≈ 0.295455.

The quickest way to compute this value is to use the second formula in (4.12).
It is instructive to compare the algebraic solution method with the general minimization technique taught in multi-variable calculus, [9]. The critical points of p(x1, x2) are found by setting both partial derivatives equal to zero:

    ∂p/∂x1 = 8 x1 - 2 x2 + 3 = 0,        ∂p/∂x2 = -2 x1 + 6 x2 - 2 = 0.

If we divide by an overall factor of 2, these are precisely the same linear equations we already constructed in (4.15). Thus, not surprisingly, the calculus approach leads to the same critical point. To check whether a critical point is a local minimum, we need to test the second derivative. In the case of a function of several variables, this requires analyzing the Hessian matrix, which is the symmetric matrix of second order partial derivatives

    H = [ ∂²p/∂x1²     ∂²p/∂x1∂x2 ]  =  [  8  -2 ]  =  2 K,
        [ ∂²p/∂x1∂x2   ∂²p/∂x2²   ]     [ -2   6 ]

which is exactly twice the quadratic coefficient matrix (4.14). If the Hessian matrix is positive definite, which we already know in this case, then the critical point is indeed a (local) minimum. Thus, the calculus and algebraic approaches to this minimization problem lead (not surprisingly) to identical results. However, the algebraic method is more powerful, because it produces the unique global minimum, whereas calculus can only guarantee that the critical point is a local minimum. The reader can find the full story on minimization of nonlinear functions in Section 18.3.
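Formulas (4.11) and (4.12) translate directly into a two-line computation; the following NumPy sketch (our addition) reproduces the numbers of Example 4.3 as reconstructed above:

    import numpy as np

    K = np.array([[4.0, -1.0], [-1.0, 3.0]])
    f = np.array([-1.5, 1.0])
    c = 1.0
    x_star = np.linalg.solve(K, f)     # solves K x = f, formula (4.11)
    print(x_star)                      # [-0.31818...  0.22727...] = (-7/22, 5/22)
    print(c - f @ x_star)              # 0.29545... = 13/44, the minimum value per (4.12)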
Example 4.4. Let us minimize the quadratic function

    p(x, y, z) = x² + 2 x y + x z + 2 y² + y z + 2 z² + x - 3 y + 11 z + 3.

We begin by rewriting it in matrix form (4.10), so

    p(x, y, z) = ( x  y  z ) K ( x, y, z )ᵀ - 2 ( x  y  z ) f + 3,

where

    K = [  1    1   1/2 ],        f = ( -1/2  ),        c = 3.
        [  1    2   1/2 ]             (  3/2  )
        [ 1/2  1/2   2  ]             ( -11/2 )

Gaussian elimination produces the L D Lᵀ factorization

    K = [  1   0  0 ] [ 1  0   0  ] [ 1  1  1/2 ]
        [  1   1  0 ] [ 0  1   0  ] [ 0  1   0  ].
        [ 1/2  0  1 ] [ 0  0  7/4 ] [ 0  0   1  ]

The pivots, i.e., the diagonal entries of D, are all positive, and hence K > 0 is positive definite. Theorem 4.2 then guarantees that p(x, y, z) has a unique minimizer. The minimizer is found by solving the linear system K x = f. Since we already know the L D Lᵀ factorization of the coefficient matrix, the solution is obtained by forward and back substitution:

    x⋆ = -1,    y⋆ = 2,    z⋆ = -3,    with    p(x⋆, y⋆, z⋆) = p(-1, 2, -3) = -17,

giving the minimum value for p(x, y, z) over all possible x, y, z.


Remark: If K is not positive definite, then the quadratic function (4.10) does not have a minimum, apart from one exceptional situation. Indeed, if w is any vector, and we let x = t w, then

    p(x) = a t² + 2 b t + c,    where    a = wᵀ K w,    b = wᵀ f.

If a = wᵀ K w ≤ 0, then this quadratic function has no minimum value, with one tiny exception: if a = b = 0 then the function is constant, and every value of t trivially provides a minimum. Note that the condition a = wᵀ K w = 0 requires that w be a null direction for the quadratic form defined by the matrix K, while the condition b = w · f = 0 requires that f be orthogonal to the null vector w.
With a little more work, the following necessary and sufficient conditions for the existence of a minimum can be established.
Theorem 4.5. If K > 0 is positive definite, then the quadratic function p(x) = xᵀ K x - 2 xᵀ f + c has a unique global minimizer x⋆. If K ≥ 0 is positive semi-definite, and f is orthogonal to every null direction z ∈ ker K, then p has a global minimum x⋆. However, in the semi-definite case, the minimum is not unique, since p(x⋆ + z) = p(x⋆) for any null vector z. In all other cases, there is no global minimum, and p(x) can assume arbitrarily large negative values.

4.3. The Closest Point.


We are now in a position to solve the basic geometric problem of finding the closest point on a subspace to a given point in Euclidean space. As we saw in Section 4.1, this will simultaneously solve the least squares minimization problem for systems of linear equations.
Problem: Let V be a subspace of Rᵐ. Given a point b ∈ Rᵐ, find the point v⋆ ∈ V which minimizes the Euclidean distance ‖v - b‖ over all possible points v ∈ V in the subspace.
The minimal distance ‖v⋆ - b‖ to the closest point is called the distance from the point b to the subspace V. Of course, if b ∈ V is in the subspace, then the answer is easy: v⋆ = b, with a distance of 0 from b to the subspace. Thus, the problem only becomes interesting when b ∉ V.
Example 4.6. The simplest version of this problem is to find the closest point on a straight line L to a given point in the plane R². We assume that the line goes through the origin, say in the direction of a vector a = ( a1, a2 )ᵀ. The line itself is a one-dimensional subspace consisting of all scalar multiples t a = ( t a1, t a2 )ᵀ of the vector. Given a point b = ( b1, b2 )ᵀ, the squared Euclidean distance from b to a general point v = t a on the line is

    p(t) = ‖b - v‖² = ‖b - t a‖² = t² ‖a‖² - 2 t b · a + ‖b‖².

[Figure 4.2: Closest Point to a Line.]

The closest point is the one that minimizes this quadratic function, and so has the value t⋆ = (b · a)/‖a‖². Therefore, the closest point on L is

    v⋆ = t⋆ a = ( (b · a)/‖a‖² ) a.        (4.16)

The distance from the point to the line is the distance to the closest point, namely

    ‖b - v⋆‖ = √( ‖b‖² ‖a‖² - (a · b)² ) / ‖a‖ = | a1 b2 - a2 b1 | / √( a1² + a2² ).        (4.17)

For example, let us find the closest point on the line 3 y = 2 x to the point b = ( 1/2, 1 )ᵀ. Here a = ( 3, 2 )ᵀ, and so, by (4.16), the closest point is

    v⋆ = ( (7/2)/13 ) a = ( 21/26, 7/13 )ᵀ.

The distance to the line is ‖b - v⋆‖ = 2/√13. The reader may have noticed that the line connecting the closest point v⋆ to b is perpendicular (orthogonal) to the line L. This is a general fact, and will be systematically exploited in Chapter 5.
To solve the closest point problem in general, suppose we know a basis v1, . . . , vn of our subspace V ⊂ Rᵐ. Then the most general vector in V has the form

    v = x1 v1 + ... + xn vn = A x,        (4.18)

where x = ( x1, x2, . . . , xn )ᵀ are the coordinates of v with respect to the given basis, and where A = ( v1 v2 . . . vn ) is the m × n matrix whose columns are the basis vectors of V.
The goal is to minimize the Euclidean distance, or, equivalently, its square,

    ‖b - v‖² = ‖b‖² - 2 v · b + ‖v‖²,        (4.19)

over all possible v ∈ V. We now substitute the formula (4.18) for v. As we shall see, the resulting expression is a quadratic function of the coefficients x = ( x1, x2, . . . , xn )ᵀ, and so the minimum is provided by Theorem 4.2.
There are two ways to perform the required computation. First, if we use the linear combination version of (4.18) and expand the resulting inner product, we find

    ‖v‖² = (x1 v1 + ... + xn vn) · (x1 v1 + ... + xn vn) = Σ_{i,j=1}^n xi xj vi · vj.        (4.20)

Therefore,

    ‖v‖² = Σ_{i,j=1}^n kij xi xj = xᵀ K x,

where K is the symmetric n × n Gram matrix whose (i, j) entry is the dot product

    kij = vi · vj = viᵀ vj        (4.21)

between the basis vectors of our subspace. Similarly,

    v · b = (x1 v1 + ... + xn vn) · b = Σ_{i=1}^n xi vi · b,

and so

    v · b = Σ_{i=1}^n fi xi = f · x = xᵀ f,

where f ∈ Rⁿ is the vector whose iᵗʰ entry is the dot product

    fi = vi · b        (4.22)

between our point and the basis vectors. Therefore, the squared distance (4.19) equals the quadratic function

    p(x) = xᵀ K x - 2 xᵀ f + c = Σ_{i,j=1}^n kij xi xj - 2 Σ_{i=1}^n fi xi + c,        (4.23)

where K and f are given in (4.21), (4.22), while c = ‖b‖².


Alternatively, we can employ the matrix version of (4.18), leading to the alternative formulae
‖ v ‖^2 = ‖ A x ‖^2 = (A x)^T A x = x^T A^T A x,   v · b = v^T b = (A x)^T b = x^T A^T b.   (4.24)
Therefore, we can also compute the Gram matrix and vector f by matrix multiplication:
K = A^T A,   f = A^T b,   c = ‖ b ‖^2.   (4.25)

Since we assumed that the basis vectors v1 , . . . , vn are linearly independent, Proposition 3.32 assures us that the associated Gram matrix K = AT A is positive definite.
Therefore, we may directly apply our basic minimization Theorem 4.2 to solve the closest
point problem.
Theorem 4.7. If v_1, ..., v_n form a basis for the subspace V ⊂ R^m, then the closest point
v* = A x* = Σ_{i=1}^n x*_i v_i ∈ V
to a given point b ∈ R^m is given by the solution x* = K^{-1} f to the linear system
K x = f,   (4.26)
where K and f are given in (4.25). The distance between the point and the subspace is
‖ b − v* ‖ = √( ‖ b ‖^2 − f^T K^{-1} f ).   (4.27)
Example 4.8. Let V ⊂ R^3 be the plane spanned by
v_1 = ( 1, 2, −1 )^T,   v_2 = ( 2, −3, −1 )^T.
Let b = ( 1, 0, 0 )^T. To find the closest point on the given plane to b, we first combine the basis vectors to form the matrix
A = \begin{pmatrix} 1 & 2 \\ 2 & -3 \\ -1 & -1 \end{pmatrix}.
According to (4.25), the positive definite Gram matrix and associated vector are
K = A^T A = \begin{pmatrix} 6 & -3 \\ -3 & 14 \end{pmatrix},   f = A^T b = \begin{pmatrix} 1 \\ 2 \end{pmatrix}.
We solve the linear system K x = f for
x* = K^{-1} f = ( 4/15, 1/5 )^T.
Theorem 4.7 implies that the closest point is
v* = x*_1 v_1 + x*_2 v_2 = A x* = ( 2/3, −1/15, −7/15 )^T.
Thus, the distance from the point b to the plane is ‖ v* − b ‖ = √(1/3) ≈ .5774.
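The computation in Example 4.8 is easy to reproduce numerically. The following short sketch (Python with NumPy; an illustration added here, not part of the original text, and using the basis vectors as reconstructed above) assembles K = A^T A and f = A^T b from (4.25), solves the normal system (4.26), and evaluates the distance formula (4.27).

    import numpy as np

    # columns of A are the basis vectors of the subspace V, as in Example 4.8
    A = np.array([[1.0,  2.0],
                  [2.0, -3.0],
                  [-1.0, -1.0]])
    b = np.array([1.0, 0.0, 0.0])

    K = A.T @ A                          # Gram matrix, equation (4.25)
    f = A.T @ b                          # right hand side, equation (4.25)
    x_star = np.linalg.solve(K, f)       # normal equations (4.26)
    v_star = A @ x_star                  # closest point in V
    dist = np.sqrt(b @ b - f @ x_star)   # distance formula (4.27)

    print(v_star)   # approximately [ 0.6667 -0.0667 -0.4667 ]
    print(dist)     # approximately 0.5774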

4.4. Least Squares.


As we first observed in Section 4.1, the solution to the closest point problem is also
the solution of the basic least squares minimization problem!
Definition 4.9. The least squares solution to a linear system of equations
A x = b   (4.28)
is the vector x* ∈ R^n that minimizes the Euclidean norm ‖ A x − b ‖.


If the system (4.28) actually has a solution, then it is automatically the least squares
solution. Thus, the concept of least squares solution is new only when the system does
not have a solution, i.e., b does not lie in the range of A. We also want the least squares
solution to be unique. For an ordinary solution, this happens if and only if ker A = {0},
or, equivalently, the columns of A are linearly independent, or, equivalently, rank A = n.
We will impose the same condition on A in our analysis of the least squares problem.
To make the connection with the closest point problem, we identify the subspace V = rng A ⊂ R^m as the range or column space of the matrix A. Assuming that the columns of A are linearly independent, they then form a basis for the range. The closest point v* ∈ V to b is the least squares solution to (4.28), since both minimize the norm ‖ v − b ‖ = ‖ A x − b ‖ over all possible v = A x ∈ V = rng A. Therefore, Theorem 4.7 immediately solves the least squares minimization problem. We state this more explicitly by using (4.25) to write out the linear system (4.26) and the minimal distance (4.27).
Theorem 4.10. Assume ker A = {0}. Set K = A^T A and f = A^T b. Then the least squares solution to A x = b is the unique solution to the normal equations
K x = f   or   (A^T A) x = A^T b,   (4.29)
namely
x* = (A^T A)^{-1} A^T b.   (4.30)
The minimal least squares error is
‖ A x* − b ‖^2 = ‖ b ‖^2 − b^T A (A^T A)^{-1} A^T b.   (4.31)
Note that the normal equations (4.29) are obtained by multiplying the original system (4.28) on both sides by A^T. If A x = b has a solution, then (4.30) reduces to it. In particular, if A is square and invertible, then (A^T A)^{-1} = A^{-1} (A^T)^{-1}, and so (4.30) reduces to x = A^{-1} b, while the two terms in the error formula (4.31) cancel out, producing 0 error. In the rectangular case, when this is not possible, formula (4.30) gives a new formula for the solution to (4.28).
Example 4.11. Consider the linear system
x_1 + 2 x_2 = 1,
3 x_1 − x_2 + x_3 = 0,
−x_1 + 2 x_2 + x_3 = −1,
x_1 − x_2 − 2 x_3 = 2,
2 x_1 + x_2 − x_3 = 2,


consisting of 5 equations in 3 unknowns. The coefficient matrix and right hand side are
A = \begin{pmatrix} 1 & 2 & 0 \\ 3 & -1 & 1 \\ -1 & 2 & 1 \\ 1 & -1 & -2 \\ 2 & 1 & -1 \end{pmatrix},   b = ( 1, 0, −1, 2, 2 )^T.

A direct application of Gaussian elimination shows that b ∉ rng A, and so the system is incompatible: it has no solution. Of course, to apply the least squares method, one is not required to check this in advance. If the system has a solution, it is the least squares solution too, and the least squares method will find it.
To form the normal equations (4.29), we compute
K = A^T A = \begin{pmatrix} 16 & -2 & -2 \\ -2 & 11 & 2 \\ -2 & 2 & 7 \end{pmatrix},   f = A^T b = ( 8, 0, −7 )^T.
Solving the 3 × 3 system K x = f by Gaussian elimination, we find
x* = K^{-1} f = ( 229/556, 69/278, −265/278 )^T ≈ ( 0.4119, 0.2482, −0.9532 )^T
to be the least squares solution to the system. The least squares error is
‖ b − A x* ‖ ≈ ‖ ( 0.0917, −0.0342, −0.131, −0.0701, −0.0252 )^T ‖ ≈ 0.1799,
which is reasonably small, indicating that the system is, roughly speaking, not too incompatible.
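As a numerical check on Example 4.11, here is a brief sketch (Python with NumPy; illustrative, not part of the original text) that forms and solves the normal equations (4.29). The built-in routine numpy.linalg.lstsq applied directly to A and b returns the same solution.

    import numpy as np

    A = np.array([[ 1.0,  2.0,  0.0],
                  [ 3.0, -1.0,  1.0],
                  [-1.0,  2.0,  1.0],
                  [ 1.0, -1.0, -2.0],
                  [ 2.0,  1.0, -1.0]])
    b = np.array([1.0, 0.0, -1.0, 2.0, 2.0])

    K = A.T @ A                      # 3 x 3 Gram matrix
    f = A.T @ b
    x_star = np.linalg.solve(K, f)   # least squares solution, equation (4.30)
    error = np.linalg.norm(b - A @ x_star)

    print(x_star)   # approximately [ 0.4119  0.2482 -0.9532 ]
    print(error)    # approximately 0.1799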

4.5. Data Fitting and Interpolation.


One of the most important applications of the least squares minimization process is
to the fitting of data points. Suppose we are running an experiment in which we measure
a certain time-dependent physical quantity. At time ti we make the measurement yi , and
thereby obtain a set of m data points
(t_1, y_1), (t_2, y_2), ..., (t_m, y_m).   (4.32)
Suppose our theory indicates that the data points are supposed to all lie on a single line
y = α + β t,   (4.33)
whose precise form, meaning its coefficients α, β, is to be determined. Experimental error will almost inevitably make this impossible to achieve exactly, and so the problem is to find the straight line (4.33) which best fits the measured data.
The error between the measured value y_i and the sample value predicted by the function (4.33) is
e_i = y_i − ( α + β t_i ),   i = 1, ..., m.

Figure 4.3. Least Squares Approximation of Data by a Straight Line.

We can write this system in matrix form as
e = y − A x,
where
e = ( e_1, e_2, ..., e_m )^T,   y = ( y_1, y_2, ..., y_m )^T,   while   A = \begin{pmatrix} 1 & t_1 \\ 1 & t_2 \\ \vdots & \vdots \\ 1 & t_m \end{pmatrix},   x = \begin{pmatrix} α \\ β \end{pmatrix}.   (4.34)
We call e the error vector and y the data vector. The coefficients α, β of our desired function (4.33) are the unknowns, forming the entries of the column vector x.
If we could fit the data exactly, so y_i = α + β t_i for all i, then each e_i = 0, and we could solve A x = y. In matrix language, the data points all lie on a straight line if and only if y ∈ rng A. If the data points are not all collinear, then we seek the straight line that minimizes the least squares error or Euclidean norm
Error = ‖ e ‖ = √( e_1^2 + ··· + e_m^2 ).
Pictorially, referring to Figure 4.3, the errors are the vertical distances from the points to the line, and we are seeking to minimize the square root of the sum of the squares of the individual errors. In vector language, we are looking for the coefficient vector x = ( α, β )^T which minimizes the Euclidean norm of the error vector
‖ e ‖ = ‖ A x − y ‖.   (4.35)

This choice of minimization may strike the reader as a little odd. Why not just minimize the sum of the absolute values of the errors, i.e., the 1 norm ‖ e ‖_1 = Σ | e_i | of the error vector, or minimize the maximal error, i.e., the ∞ norm ‖ e ‖_∞ = max{ | e_i | }? Or, for that matter, why measure only the vertical distance to the line? Maybe the perpendicular distance from each data point to the line, as computed in Example 4.6, would be a better measure of error. The answer is that, although all of these alternative minimization criteria are interesting and potentially useful, they all lead to nonlinear minimization problems, and are much harder to solve! The least squares minimization problem leads to linear equations, and needs to be properly understood before moving on, in Section 18.3, to the more complicated nonlinear minimization problems.


Theorem 4.10 prescribes the solution to this least squares minimization problem. We form the normal equations
(A^T A) x = A^T y,   with solution   x* = (A^T A)^{-1} A^T y.   (4.36)

This assumes that the matrix A has linearly independent columns, which requires that not all the t_i are equal, i.e., we must measure the data at two or more different times. (However, this does not preclude measuring some of the data at the same time, e.g., by repeating the experiment. The reader may wish to consider why choosing all the t_i's to be the same is a silly data fitting problem.)
For the particular matrices (4.34), we compute
A^T A = \begin{pmatrix} 1 & 1 & \cdots & 1 \\ t_1 & t_2 & \cdots & t_m \end{pmatrix} \begin{pmatrix} 1 & t_1 \\ 1 & t_2 \\ \vdots & \vdots \\ 1 & t_m \end{pmatrix} = \begin{pmatrix} m & Σ t_i \\ Σ t_i & Σ (t_i)^2 \end{pmatrix} = m \begin{pmatrix} 1 & \bar t \\ \bar t & \overline{t^2} \end{pmatrix},
A^T y = \begin{pmatrix} 1 & 1 & \cdots & 1 \\ t_1 & t_2 & \cdots & t_m \end{pmatrix} \begin{pmatrix} y_1 \\ y_2 \\ \vdots \\ y_m \end{pmatrix} = \begin{pmatrix} Σ y_i \\ Σ t_i y_i \end{pmatrix} = m \begin{pmatrix} \bar y \\ \overline{t\,y} \end{pmatrix},   (4.37)

where the overbars, namely
\bar t = (1/m) Σ_{i=1}^m t_i,   \bar y = (1/m) Σ_{i=1}^m y_i,   \overline{t^2} = (1/m) Σ_{i=1}^m t_i^2,   \overline{t\,y} = (1/m) Σ_{i=1}^m t_i y_i,   (4.38)
denote the average values of the indicated variables.


Warning: The average of a product is not equal to the product of the averages! In particular,
\overline{t^2} ≠ ( \bar t )^2,   \overline{t\,y} ≠ \bar t \, \bar y.
Substituting (4.37) into the normal equations (4.36), and canceling the common factor of m, we find that we have only to solve a pair of linear equations
α + \bar t β = \bar y,   \bar t α + \overline{t^2} β = \overline{t\,y}.
The solution is
α = \bar y − \bar t β,   β = ( \overline{t\,y} − \bar t \, \bar y ) / ( \overline{t^2} − (\bar t)^2 ) = Σ (t_i − \bar t) y_i / Σ (t_i − \bar t)^2.   (4.39)
Therefore, the best (in the least squares sense) straight line that fits the given data is
y = β ( t − \bar t ) + \bar y,
where the line's slope β is given in (4.39).
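The averaged formulas (4.39) translate directly into a few lines of code. The sketch below (Python with NumPy; an illustration added here, not part of the original text) computes the slope β and intercept α of the best fitting line, and is applied to the data of Example 4.12 below.

    import numpy as np

    def fit_line(t, y):
        """Least squares line y = alpha + beta*t through the data (t_i, y_i), via (4.39)."""
        t = np.asarray(t, dtype=float)
        y = np.asarray(y, dtype=float)
        t_bar, y_bar = t.mean(), y.mean()
        beta = np.sum((t - t_bar) * y) / np.sum((t - t_bar) ** 2)
        alpha = y_bar - t_bar * beta
        return alpha, beta

    alpha, beta = fit_line([0, 1, 3, 6], [2, 3, 7, 12])
    print(alpha, beta)   # both approximately 12/7 = 1.7143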

Example 4.12. Suppose the data points are given by the table
t_i :  0   1   3   6
y_i :  2   3   7   12
Then
A = \begin{pmatrix} 1 & 0 \\ 1 & 1 \\ 1 & 3 \\ 1 & 6 \end{pmatrix},   A^T = \begin{pmatrix} 1 & 1 & 1 & 1 \\ 0 & 1 & 3 & 6 \end{pmatrix},   y = ( 2, 3, 7, 12 )^T.
Therefore
A^T A = \begin{pmatrix} 4 & 10 \\ 10 & 46 \end{pmatrix},   A^T y = \begin{pmatrix} 24 \\ 96 \end{pmatrix}.
The normal equations reduce to
4 α + 10 β = 24,   10 α + 46 β = 96,   so   α = 12/7,   β = 12/7.
Therefore, the best least squares fit to the data is the straight line
y = (12/7) t + 12/7.
Alternatively, one can compute this formula directly using (4.39).


Example 4.13. Suppose we are given a sample of an unknown radioactive isotope. At time t_i we measure, using a Geiger counter, the amount m_i of radioactive material in the sample. The problem is to determine the initial amount of material and the half life. If the measurements were exact, we would have m(t) = m_0 e^{β t}, where m_0 = m(0) is the initial mass, and β < 0 the decay rate. The half life of the isotope is given by t* = log 2 / (−β); see Example 8.1 for additional information.
As it stands this is not a linear least squares problem, but it can be converted to that form by taking logarithms:
y(t) = log m(t) = log m_0 + β t = α + β t.
We can thus do a linear least squares fit on the logarithm y_i = log m_i of the radioactive mass at the measurement times t_i to determine the best values for β and α = log m_0.
Polynomial Approximation and Interpolation
The basic least squares philosophy has a variety of different extensions, all interesting and all useful. First, we can easily replace the affine function (4.33) by a quadratic function
y = α + β t + γ t^2.   (4.40)
In this case, we are looking for the parabola that best fits the data. For example, Newton's theory of gravitation says that (in the absence of air resistance) a falling object obeys the

Figure 4.4. Interpolating Polynomials: Linear, Quadratic, Cubic.
parabolic law (4.40), where α = h_0 is the initial height, β = v_0 is the initial velocity, and γ = −(1/2) g m is minus one half the weight of the object. Suppose we observe a falling body, and measure its height y_i at times t_i. Then we can approximate its initial height, initial velocity and weight by finding the parabola (4.40) that best fits the data. Again, we can find the least squares fit by minimizing the sum of the squares of the errors e_i = y_i − y(t_i).
The method can evidently be extended to a completely general polynomial function
y(t) = α_0 + α_1 t + ··· + α_n t^n   (4.41)
of degree n. The total least squares error between the data and the sample values of the function is given by
‖ e ‖^2 = Σ_{i=1}^m ( y_i − y(t_i) )^2 = ‖ y − A x ‖^2,   (4.42)
where
A = \begin{pmatrix} 1 & t_1 & t_1^2 & \cdots & t_1^n \\ 1 & t_2 & t_2^2 & \cdots & t_2^n \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & t_m & t_m^2 & \cdots & t_m^n \end{pmatrix},   x = ( α_0, α_1, ..., α_n )^T.   (4.43)

In particular, if m = n + 1, then A is square, and so, assuming A is invertible, we can solve A x = y exactly. In other words, there is no error, and the solution is an interpolating polynomial that fits the data exactly. A proof of the following result can be found in Exercise .
Lemma 4.14. If t_1, ..., t_{n+1} are distinct, t_i ≠ t_j, then the (n+1) × (n+1) interpolation matrix (4.43) is nonsingular.
This result immediately implies the basic existence theorem for interpolating polynomials.
Theorem 4.15. Let t1 , . . . , tn+1 be distinct sample points. Then, for any given data
y1 , . . . , yn+1 , there exists a unique degree n interpolating polynomial (4.41) with sample
values y(ti ) = yi for all i = 1, . . . , n + 1.
Thus, two points will determine a unique interpolating line, three points a unique
interpolating parabola, four points an interpolating cubic, and so on. Examples are illustrated in Figure 4.4.

Example 4.16. The basic ideas of interpolation and least squares fitting of data can be applied to approximate complicated mathematical functions by much simpler polynomials. Such approximation schemes are used in all numerical computations: when you ask your computer or calculator to compute e^t or cos t or any other function, it only knows how to add, subtract, multiply and divide, and so must rely on an approximation scheme based on polynomials! In the dark ages before computers, one would consult precomputed tables of values of the function at particular data points. If one needed a value at a nontabulated point, then some form of polynomial interpolation would be used to accurately approximate the intermediate value.
For example, suppose we want to compute reasonably accurate values for the exponential function e^t for values of t lying in the interval 0 ≤ t ≤ 1 by using a quadratic polynomial
p(t) = α + β t + γ t^2.   (4.44)
If we choose 3 points, say t_1 = 0, t_2 = .5, t_3 = 1, then there is a unique quadratic polynomial (4.44) that interpolates e^t at the data points, i.e.,
p(t_i) = e^{t_i}   for   i = 1, 2, 3.
In this case, the coefficient matrix (4.43), namely
A = \begin{pmatrix} 1 & 0 & 0 \\ 1 & 0.5 & 0.25 \\ 1 & 1 & 1 \end{pmatrix},
is invertible. Therefore, we can exactly solve the interpolation equations A x = y, where
y = ( e^{t_1}, e^{t_2}, e^{t_3} )^T = ( 1, 1.64872, 2.71828 )^T
is the data vector. The solution
x = ( α, β, γ )^T = ( 1., 0.876603, 0.841679 )^T
yields the interpolating polynomial
p(t) = 1 + 0.876603 t + 0.841679 t^2.   (4.45)

It is the unique quadratic polynomial that agrees with et at the three specified data points.
See Figure 4.5 for a comparison of the graphs; the first graph shows e t , the second p(t), and

Actually, one could also allow interpolation and approximation by rational functions, a subject known as Padé approximation theory. See [ Pade ] for details.


Figure 4.5. Quadratic Interpolating Polynomial for e^t.

the third lays the two graphs on top of each other. Even with such a simple interpolation scheme, the two functions are quite close. The L^∞ norm of the difference is
‖ e^t − p(t) ‖_∞ = max { | e^t − p(t) | : 0 ≤ t ≤ 1 } ≈ .01442,
with the maximum error occurring at t ≈ .796.
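The interpolation equations of Example 4.16 can be set up and solved numerically as follows (a Python/NumPy sketch added here for illustration, not part of the original text); A is the 3 × 3 coefficient matrix (4.43) for the sample points 0, .5, 1.

    import numpy as np

    t = np.array([0.0, 0.5, 1.0])            # sample points
    y = np.exp(t)                            # data values e^{t_i}
    A = np.vander(t, 3, increasing=True)     # columns 1, t, t^2, as in (4.43)
    coeffs = np.linalg.solve(A, y)           # interpolation equations A x = y
    print(coeffs)                            # approximately [1.  0.876603  0.841679]

    # maximal error of the interpolant on [0, 1]
    s = np.linspace(0.0, 1.0, 1001)
    p = coeffs[0] + coeffs[1]*s + coeffs[2]*s**2
    print(np.max(np.abs(np.exp(s) - p)))     # roughly 0.0144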

There is, in fact, an explicit formula for the interpolating polynomial that is named after the influential eighteenth century Italo-French mathematician Joseph-Louis Lagrange. It relies on the basic superposition principle for solving inhomogeneous systems, Theorem 2.43. Specifically, if we know the solutions x_1, ..., x_{n+1} to the particular interpolation systems
A x_k = e_k,   k = 1, ..., n + 1,   (4.46)
where e_1, ..., e_{n+1} are the standard basis vectors of R^{n+1}, then the solution to
A x = y = y_1 e_1 + ··· + y_{n+1} e_{n+1}
is given by the superposition formula
x = y_1 x_1 + ··· + y_{n+1} x_{n+1}.
The particular interpolation equation (4.46) corresponds to interpolation data y = e_k, meaning that y_i = 0 at all points t_i for i ≠ k, except for y_k = 1. If we can find the n + 1 particular interpolating polynomials that realize this very special data, we can use superposition to construct the general interpolating polynomial.
It turns out that there is a simple explicit formula for the basic interpolating polynomials.
It turns out that there is a simple explicit formula for the basic interpolating polynomials.
Theorem 4.17. Given distinct values t_1, ..., t_{n+1}, the kth Lagrange interpolating polynomial is the degree n polynomial given by
L_k(t) = [ (t − t_1) ··· (t − t_{k−1})(t − t_{k+1}) ··· (t − t_{n+1}) ] / [ (t_k − t_1) ··· (t_k − t_{k−1})(t_k − t_{k+1}) ··· (t_k − t_{n+1}) ],   k = 1, ..., n + 1.   (4.47)
It is the unique polynomial of degree n that satisfies
L_k(t_i) = 1 if i = k,  and  L_k(t_i) = 0 if i ≠ k,   i, k = 1, ..., n + 1.   (4.48)

Figure 4.6. Lagrange Interpolating Polynomials L_1(t), L_2(t), L_3(t) for the Points 0, .5, 1.

Proof: The uniqueness of the Lagrange interpolating polynomial is an immediate consequence of Theorem 4.15. To show that (4.47) is the correct formula, we note that when t = t_i, i ≠ k, the factor (t − t_i) in the numerator of L_k(t) vanishes, while when t = t_k the numerator and denominator are equal.   Q.E.D.
Theorem 4.18. If t_1, ..., t_{n+1} are distinct, then the degree n polynomial interpolating the data y_1, ..., y_{n+1} at those points is
p(t) = y_1 L_1(t) + ··· + y_{n+1} L_{n+1}(t).   (4.49)
Proof: We merely compute
p(t_k) = y_1 L_1(t_k) + ··· + y_{n+1} L_{n+1}(t_k) = y_k,
where, according to (4.48), every summand except the kth is zero.   Q.E.D.

Example 4.19. For example, the three quadratic Lagrange interpolating polynomials for the values t_1 = 0, t_2 = 1/2, t_3 = 1 used to interpolate e^t in Example 4.16 are
L_1(t) = (t − 1/2)(t − 1) / [ (0 − 1/2)(0 − 1) ] = 2 t^2 − 3 t + 1,
L_2(t) = (t − 0)(t − 1) / [ (1/2 − 0)(1/2 − 1) ] = −4 t^2 + 4 t,   (4.50)
L_3(t) = (t − 0)(t − 1/2) / [ (1 − 0)(1 − 1/2) ] = 2 t^2 − t.

Thus, one can rewrite the quadratic interpolant (4.45) to e^t as
y(t) = L_1(t) + e^{1/2} L_2(t) + e L_3(t) = (2 t^2 − 3 t + 1) + 1.64872 (−4 t^2 + 4 t) + 2.71828 (2 t^2 − t).
We stress that this is the same interpolating polynomial; we have merely rewritten it in the more transparent Lagrange form.
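Formula (4.47) is straightforward to program. The following sketch (Python with NumPy; illustrative only, not from the original text) evaluates the interpolating polynomial (4.49) in Lagrange form, and reproduces the quadratic interpolant of e^t from Examples 4.16 and 4.19.

    import numpy as np

    def lagrange_eval(t, nodes, values):
        """Evaluate the interpolating polynomial (4.49) at t using the Lagrange form (4.47)."""
        nodes = np.asarray(nodes, dtype=float)
        p = 0.0
        for k, yk in enumerate(values):
            # L_k(t): product over all factors except the k-th, equation (4.47)
            num = np.prod([t - nodes[i] for i in range(len(nodes)) if i != k])
            den = np.prod([nodes[k] - nodes[i] for i in range(len(nodes)) if i != k])
            p += yk * num / den
        return p

    nodes = [0.0, 0.5, 1.0]
    values = np.exp(nodes)
    print(lagrange_eval(0.3, nodes, values))   # about 1.3387, close to e^0.3 = 1.3499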

Figure 4.7. Degree 2, 4 and 10 Interpolating Polynomials for 1/(1 + t^2).

One might expect that the higher the degree, the more accurate the interpolating polynomial. This expectation turns out, unfortunately, not to be uniformly valid. While low degree interpolating polynomials are usually reasonable approximants to functions, high degree interpolants are more expensive to compute, and, moreover, can be rather badly behaved, particularly near the ends of the interval. For example, Figure 4.7 displays the degree 2, 4 and 10 interpolating polynomials for the function 1/(1 + t^2) on the interval −3 ≤ t ≤ 3 using equally spaced data points. Note the rather poor approximation of the function near the endpoints of the interval. As a consequence, high degree polynomial interpolation tends not to be used in practical applications. Better alternatives rely on least squares approximants by low degree polynomials, to be described next, and interpolation by piecewise cubic splines, a topic that will be discussed in depth in Chapter 10.
If we have m > n + 1 data points, then, usually, there is no degree n polynomial that fits all the data, and so one must switch over to a least squares approximation. The first requirement is that the associated m × (n + 1) interpolation matrix (4.43) has rank n + 1; this follows from Lemma 4.14 provided at least n + 1 of the values t_1, ..., t_m are distinct. Thus, given data at m ≥ n + 1 different sample points t_k, we can uniquely determine the best least squares polynomial of degree n that fits the data by solving the normal equations (4.36).
Example 4.20. If we use more than three data points, but still require a quadratic polynomial, then we cannot interpolate exactly, and must use a least squares approximant. Let us return to the problem of approximating the exponential function e^t. For instance, using five equally spaced sample points t_1 = 0, t_2 = .25, t_3 = .5, t_4 = .75, t_5 = 1, the coefficient matrix and sampled data vector (4.43) are
A = \begin{pmatrix} 1 & 0 & 0 \\ 1 & 0.25 & 0.0625 \\ 1 & 0.5 & 0.25 \\ 1 & 0.75 & 0.5625 \\ 1 & 1 & 1 \end{pmatrix},   y = ( 1, 1.28403, 1.64872, 2.117, 2.71828 )^T.

Figure 4.8. Quadratic Approximating Polynomial and Quartic Interpolating Polynomial for e^t.

The solution to the normal equations (4.29), with
K = A^T A = \begin{pmatrix} 5. & 2.5 & 1.875 \\ 2.5 & 1.875 & 1.5625 \\ 1.875 & 1.5625 & 1.38281 \end{pmatrix},   f = A^T y = ( 8.76803, 5.4514, 4.40153 )^T,
is
x = K^{-1} f = ( 1.00514, 0.864277, 0.843538 )^T.
This leads to the modified approximating quadratic polynomial
p_2(t) = 1.00514 + 0.864277 t + 0.843538 t^2.
On the other hand, the quartic interpolating polynomial
p_4(t) = 0.069416 t^4 + 0.140276 t^3 + 0.509787 t^2 + 0.998803 t + 1
is found directly from the data values as above. The quadratic polynomial has a maximal error of ≈ 0.011 (slightly better than the quadratic interpolant), while the quartic has a significantly smaller maximal error: 0.0000527. See Figure 4.8 for a comparison of the graphs, and Example 4.23 below for further discussion.
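For Example 4.20, the normal equations can be assembled and solved in a few lines (a Python/NumPy sketch added here for illustration, not part of the original text); numpy.linalg.lstsq applied to A and y returns the same coefficients.

    import numpy as np

    t = np.linspace(0.0, 1.0, 5)             # sample points 0, .25, .5, .75, 1
    y = np.exp(t)
    A = np.vander(t, 3, increasing=True)     # m x (n+1) matrix (4.43) with n = 2

    K = A.T @ A
    f = A.T @ y
    x = np.linalg.solve(K, f)                # normal equations (4.29)
    print(x)    # approximately [1.00514  0.864277  0.843538]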
Approximation and Interpolation by General Functions
There is nothing special about polynomial functions in the preceding approximation scheme. For example, suppose we were interested in finding the best 2π-periodic trigonometric approximation
y = α_1 cos t + α_2 sin t
to a given set of data. Again, the least squares error takes the same form ‖ y − A x ‖^2 as in (4.42), where
A = \begin{pmatrix} \cos t_1 & \sin t_1 \\ \cos t_2 & \sin t_2 \\ \vdots & \vdots \\ \cos t_m & \sin t_m \end{pmatrix},   x = \begin{pmatrix} α_1 \\ α_2 \end{pmatrix},   y = ( y_1, y_2, ..., y_m )^T.

The key is that the unspecified parameters, in this case α_1, α_2, occur linearly in the function. Thus, the most general case is to approximate the data (4.32) by a linear combination
y(t) = α_1 h_1(t) + α_2 h_2(t) + ··· + α_n h_n(t)
of prescribed, linearly independent functions h_1(x), ..., h_n(x). The least squares error is, as always, given by
Error = √( Σ_{i=1}^m ( y_i − y(t_i) )^2 ) = ‖ y − A x ‖,
where the interpolation matrix and vector of unknown coefficients are
A = \begin{pmatrix} h_1(t_1) & h_2(t_1) & \cdots & h_n(t_1) \\ h_1(t_2) & h_2(t_2) & \cdots & h_n(t_2) \\ \vdots & \vdots & \ddots & \vdots \\ h_1(t_m) & h_2(t_m) & \cdots & h_n(t_m) \end{pmatrix},   x = ( α_1, α_2, ..., α_n )^T,   y = ( y_1, y_2, ..., y_m )^T.
Thus, the columns of A are the sampled values of the functions. If A is square and nonsingular, then we can find an interpolating function of the prescribed form by solving the linear system
A x = y.
A particularly important case is provided by the trigonometric functions
1,  cos x,  sin x,  cos 2x,  sin 2x,  ... .
Interpolation on equally spaced data points on the interval [ 0, 2π ] leads to the discrete Fourier transform, of profound significance in signal processing, data transmission, and compression. Trigonometric interpolation and the discrete Fourier transform will be the focus of Section 12.1.
If there are more than n data points, then we cannot, in general, interpolate exactly, and must content ourselves with a least squares approximation. The least squares solution is found by solving the associated normal equations K x = f, where the (i, j) entry of K = A^T A is m times the average value of the product of h_i(t) and h_j(t), namely
k_{ij} = Σ_{κ=1}^m h_i(t_κ) h_j(t_κ) = m \overline{h_i(t)\,h_j(t)},
whereas the ith entry of f = A^T y is m times the average
f_i = Σ_{κ=1}^m h_i(t_κ) y_κ = m \overline{h_i(t)\,y}.

The one key question is whether the columns of A are linearly independent; this is more subtle than the polynomial case covered by Lemma 4.14, and requires the sampled function vectors to be linearly independent, which in general is different from requiring the functions themselves to be linearly independent. See Exercise for the distinction between these two notions of linear independence.
If the parameters do not occur linearly in the functional formula, then we cannot use a linear analysis to find the least squares solution. For example, a direct linear least squares approach does not suffice to find the frequency ω, the amplitude r, and the phase δ of a general trigonometric approximation:
y = c_1 cos ω t + c_2 sin ω t = r cos( ω t + δ ).
This constitutes a nonlinear minimization problem, and must be solved by the more sophisticated techniques presented in Section 18.3.
Weighted Least Squares
Another generalization is to introduce weights in the measurement of the least squares error. Suppose some of the data is known to be more reliable or more significant than others. For example, measurements at an earlier time may be more accurate, or more critical to the data fitting problem, than measurements at later times. In that situation, we should penalize any errors at the earlier times and downplay errors in the later data.
In general, this requires the introduction of a (positive) weight c_i > 0 associated to each data point (t_i, y_i); the larger the weight, the more important the error. For a straight line approximation y = α + β t, the weighted least squares error is defined as
Error = √( Σ_{i=1}^m c_i e_i^2 ) = √( Σ_{i=1}^m c_i ( y_i − (α + β t_i) )^2 ).
Let us rewrite this formula in matrix form. Let C = diag(c_1, ..., c_m) denote the diagonal weight matrix. Note that C > 0 is positive definite, since all the weights are positive. The squared least squares error
‖ e ‖^2 = e^T C e
is the square of the norm of the error vector e with respect to the weighted inner product
⟨ v ; w ⟩ = v^T C w   (4.51)
induced by the matrix C. Since e = y − A x, we may use (4.24) to compute
‖ e ‖^2 = ‖ A x − y ‖^2 = ‖ A x ‖^2 − 2 ⟨ A x ; y ⟩ + ‖ y ‖^2
        = (A x)^T C A x − 2 (A x)^T C y + y^T C y
        = x^T A^T C A x − 2 x^T A^T C y + y^T C y = x^T K x − 2 x^T f + c,   (4.52)
where
K = A^T C A,   f = A^T C y,   c = y^T C y.
Note that K is the Gram matrix derived in (3.48), whose entries
k_{ij} = ⟨ v_i ; v_j ⟩ = v_i^T C v_j

are the weighted inner products between the column vectors v_1, ..., v_n of A. Theorem 3.33 immediately implies that K is positive definite provided A has linearly independent columns or, equivalently, has rank n ≤ m.
Theorem 4.21. Suppose A is an m × n matrix with linearly independent columns. Suppose C > 0 is any positive definite m × m matrix. Then the quadratic function (4.52) giving the weighted least squares error has a unique minimizer, which is the solution to the weighted normal equations
A^T C A x = A^T C y,   so that   x = (A^T C A)^{-1} A^T C y.   (4.53)

In other words, the weighted least squares solution to A x = y is obtained by multiplying both sides of the original system A x = y by the matrix A^T C. The derivation of this result allows C > 0 to be any positive definite matrix. Off-diagonal entries of C can be used to weight cross-correlation terms in the data.
Example 4.22. In Example 4.12 we fit the data
t_i :  0    1    3    6
y_i :  2    3    7    12
c_i :  3    2    1/2  1/4
with an unweighted least squares line. Now we shall assign the weights for the error at each sample point listed in the last row of the table, so that errors in the first two data values carry more weight. To find the weighted least squares line y = α + β t that best fits the data, we compute
A^T C A = \begin{pmatrix} 1 & 1 & 1 & 1 \\ 0 & 1 & 3 & 6 \end{pmatrix} \begin{pmatrix} 3 & 0 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ 0 & 0 & 1/2 & 0 \\ 0 & 0 & 0 & 1/4 \end{pmatrix} \begin{pmatrix} 1 & 0 \\ 1 & 1 \\ 1 & 3 \\ 1 & 6 \end{pmatrix} = \begin{pmatrix} 23/4 & 5 \\ 5 & 31/2 \end{pmatrix},
A^T C y = \begin{pmatrix} 1 & 1 & 1 & 1 \\ 0 & 1 & 3 & 6 \end{pmatrix} \begin{pmatrix} 3 & 0 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ 0 & 0 & 1/2 & 0 \\ 0 & 0 & 0 & 1/4 \end{pmatrix} \begin{pmatrix} 2 \\ 3 \\ 7 \\ 12 \end{pmatrix} = \begin{pmatrix} 37/2 \\ 69/2 \end{pmatrix}.
Thus, the weighted normal equations (4.53) reduce to
(23/4) α + 5 β = 37/2,   5 α + (31/2) β = 69/2,   so   α ≈ 1.7817,   β ≈ 1.6511.
Therefore, the least squares fit to the data under the given weights is y = 1.6511 t + 1.7817.
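The weighted normal equations (4.53) are just as easy to set up in code. The sketch below (Python with NumPy; an illustration added here, not part of the original text) redoes Example 4.22 with the diagonal weight matrix C = diag(3, 2, 1/2, 1/4).

    import numpy as np

    t = np.array([0.0, 1.0, 3.0, 6.0])
    y = np.array([2.0, 3.0, 7.0, 12.0])
    c = np.array([3.0, 2.0, 0.5, 0.25])           # weights c_i

    A = np.column_stack([np.ones_like(t), t])     # columns 1 and t, as in (4.34)
    C = np.diag(c)                                # diagonal weight matrix

    K = A.T @ C @ A                               # weighted Gram matrix, (4.52)
    f = A.T @ C @ y
    alpha, beta = np.linalg.solve(K, f)           # weighted normal equations (4.53)
    print(alpha, beta)   # approximately 1.7817 and 1.6511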
Least Squares Approximation in Function Spaces
So far, while we have used least squares minimization to interpolate and approximate
known, complicated functions by simpler polynomials, we have only worried about the
3/7/03

148

c 2003

Peter J. Olver

errors committed at a discrete, preassigned set of sample points. A more uniform approach
would be to take into account the errors committed at all points in the interval of interest.
This can be accomplished by replacing the discrete, and finite-dimensional, vector space
norm on sample vectors by a continuous, infinite-dimensional function space norm in order
to specify the least squares error that must be minimized over the entire interval.
More specifically, we let V = C^0[ a, b ] denote the space of continuous functions on the bounded interval [ a, b ] with L^2 inner product
⟨ f ; g ⟩ = ∫_a^b f(t) g(t) dt.   (4.54)

Let P^(n) denote the subspace consisting of all polynomials of degree ≤ n. For simplicity, we employ the standard monomial basis 1, t, t^2, ..., t^n. We will be approximating a general function f(t) ∈ C^0[ a, b ] by a polynomial
p(t) = α_1 + α_2 t + ··· + α_{n+1} t^n ∈ P^(n)   (4.55)
of degree at most n. The error function e(t) = f(t) − p(t) measures the discrepancy between the function and its approximating polynomial at each t. Instead of summing the squares of the errors at a finite set of sample points, we go to a continuous limit that integrates the squared errors of all points in the interval. Thus, the approximating polynomial will be characterized as the one that minimizes the total L^2 least squares error
Error = ‖ e ‖ = ‖ p − f ‖ = √( ∫_a^b [ p(t) − f(t) ]^2 dt ).   (4.56)

To solve the minimization problem, we begin by substituting (4.55) and expanding, as in (4.20):
‖ p − f ‖^2 = ‖ Σ_{i=1}^{n+1} α_i t^{i−1} − f(t) ‖^2 = Σ_{i,j=1}^{n+1} α_i α_j ⟨ t^{i−1} ; t^{j−1} ⟩ − 2 Σ_{i=1}^{n+1} α_i ⟨ t^{i−1} ; f(t) ⟩ + ‖ f(t) ‖^2.
As a result, we are led to minimize the same kind of quadratic function
x^T K x − 2 x^T f + c,   (4.57)

where x = ( α_1, α_2, ..., α_{n+1} )^T is the vector containing the unknown coefficients in the minimizing polynomial, while
k_{ij} = ⟨ t^{i−1} ; t^{j−1} ⟩ = ∫_a^b t^{i+j−2} dt,   f_i = ⟨ t^{i−1} ; f ⟩ = ∫_a^b t^{i−1} f(t) dt,   (4.58)
are, as before, the Gram matrix K consisting of inner products between basis functions along with the vector f of inner products between the right hand side and the basis functions. The coefficients of the least squares minimizing polynomial are thus found by solving the associated normal equations K x = f.

Example 4.23. Let us return to the problem of approximating the exponential function f(t) = e^t on the interval 0 ≤ t ≤ 1. We consider the subspace P^(2) consisting of all quadratic polynomials
p(t) = α + β t + γ t^2.
Using the monomial basis 1, t, t^2, the normal equations (4.29) are
\begin{pmatrix} 1 & 1/2 & 1/3 \\ 1/2 & 1/3 & 1/4 \\ 1/3 & 1/4 & 1/5 \end{pmatrix} \begin{pmatrix} α \\ β \\ γ \end{pmatrix} = \begin{pmatrix} e − 1 \\ 1 \\ e − 2 \end{pmatrix}.
The coefficient matrix is the Gram matrix K consisting of the inner products
⟨ t^i ; t^j ⟩ = ∫_0^1 t^{i+j} dt = 1/(i + j + 1)
between basis monomials, while the right hand side is the vector of inner products
⟨ e^t ; t^i ⟩ = ∫_0^1 t^i e^t dt.

The solution is computed to be
α = 39 e − 105 ≈ 1.012991,   β = −216 e + 588 ≈ .851125,   γ = 210 e − 570 ≈ .839184,
leading to the least squares quadratic approximant
p*(t) = 1.012991 + .851125 t + .839184 t^2.   (4.59)
The least squares error is
‖ e^t − p*(t) ‖ ≈ .00527593.
The maximal error is measured by the L^∞ norm of the difference,
‖ e^t − p*(t) ‖_∞ = max { | e^t − p*(t) | : 0 ≤ t ≤ 1 } ≈ .014981815,
with the maximum occurring at t = 1. Thus, the simple quadratic polynomial (4.59) will give a reasonable approximation to the first two decimal places in e^t on the entire interval [ 0, 1 ]. A more accurate approximation can be made by taking a higher degree polynomial, or by decreasing the length of the interval.
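The continuous normal equations of Example 4.23 involve the 3 × 3 Hilbert matrix and the integrals of e^t, t e^t and t^2 e^t on [0, 1], which are e − 1, 1 and e − 2. A short numerical sketch (Python with NumPy; an illustration added here, not part of the original text):

    import numpy as np

    e = np.e
    K = np.array([[1.0, 1/2, 1/3],
                  [1/2, 1/3, 1/4],
                  [1/3, 1/4, 1/5]])           # Gram (Hilbert) matrix of 1, t, t^2
    f = np.array([e - 1.0, 1.0, e - 2.0])     # inner products of e^t with 1, t, t^2

    coeffs = np.linalg.solve(K, f)            # normal equations K x = f, (4.58)
    print(coeffs)   # approximately [1.012991  0.851125  0.839184]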

Remark: Although the least squares polynomial (4.59) minimizes the L^2 norm of the error, it does slightly worse with the L^∞ norm than the previous sample-based minimizer (4.45). The problem of finding the quadratic polynomial that minimizes the L^∞ norm is more difficult, and must be solved by nonlinear minimization methods, as discussed in Section 18.3.
Remark: As noted in Example 3.35, the Gram matrix for the simple monomial basis is the n × n Hilbert matrix (1.72). The ill conditioned nature of the Hilbert matrix, and the consequential difficulty in accurately solving the normal equations, complicates the practical numerical implementation of high degree least squares polynomial approximations. A better approach, based on an alternative orthogonal polynomial basis, will be discussed in the following Chapter 5.

Chapter 5
Orthogonality
Orthogonality is the mathematical formalization of the geometrical property of perpendicularity of vectors carried over to general inner product spaces. Orthogonality is
the key to all of Fourier analysis and its manifold applications, both classical and modern.
Many computations become dramatically simpler and less prone to round off and other
errors when performed in orthogonal systems. Indeed, large scale modern applications in
signal processing, computer vision, numerical solutions of differential equations, etc., would
be impractical, if not completely infeasible, were it not for a suitable orthogonal basis.
Bases of inner product spaces that consist of mutually orthogonal elements play a
fundamental role in applications and numerical algorithms. The famous GramSchmidt
algorithm will convert an arbitrary basis of an inner product space into an orthogonal basis.
As such, it forms one of the key algorithms in linear analysis, in both finite-dimensional
vector spaces and also function space. Classical orthogonal polynomials and trigonometric bases, underlying all of Fourier analysis, and many applications to partial differential
equations, arise in this general manner. The GramSchmidt process can be re-interpreted
as a new kind of matrix factorization, in which a nonsingular matrix A = Q R is written as
the product of an orthogonal matrix Q and an upper triangular matrix R. The Q R factorization has several important applications, including one of the basic numerical techniques
for computing eigenvalues.
Orthogonality is motivated by geometry, and the methods have significant geometrical
consequences. Orthogonal matrices play a crucial role in the geometry of Euclidean space.
Applications to computer graphics, animation, and three-dimensional image analysis will
be presented in Chapter 7. We can reinterpret the closest point or least squares minimizer in terms of orthogonal projection onto a subspace. Indeed, the normal equations
underlying least squares data estimation and minimization of quadratic functions become
extraordinarily simpler when written in terms of an orthonormal basis for the subspace.
An important fact is that the fundamental subspaces of a matrix come in orthogonal pairs;
this provides an important new characterization of the compatibility conditions for linear
systems.

5.1. Orthogonal Bases.


Let V be a fixed real inner product space. Recall that two elements v, w ∈ V are called orthogonal if their inner product vanishes: ⟨ v ; w ⟩ = 0.
The methods carry over more or less straightforwardly to complex inner product spaces. The main complication is to be careful with the order of vectors, since complex inner products are not symmetric; see Section 3.6 for details. In this chapter, we will write all formulas in the correct order so that they remain valid in the complex category.

In the case of vectors in Euclidean space, this means that they meet at a right angle. A particularly important configuration is when V admits a basis consisting of mutually orthogonal elements.
Definition 5.1. A basis u_1, ..., u_n of V is called orthogonal if ⟨ u_i ; u_j ⟩ = 0 for all i ≠ j. The basis is called orthonormal if, in addition, each vector has unit length: ‖ u_i ‖ = 1, for all i = 1, ..., n.
For the Euclidean space R^n equipped with the standard dot product, the simplest example of an orthonormal basis is the standard basis
e_1 = ( 1, 0, 0, ..., 0 )^T,   e_2 = ( 0, 1, 0, ..., 0 )^T,   ...,   e_n = ( 0, 0, ..., 0, 1 )^T.
Orthogonality follows because e_i · e_j = 0 for i ≠ j, while ‖ e_i ‖ = 1 implies normality.


Since a basis cannot contain the zero vector, there is an easy way to convert an orthogonal basis to an orthonormal basis. Namely, one replaces each basis vector by a unit vector pointing in the same direction, using the method in Lemma 3.17.
Lemma 5.2. If v_1, ..., v_n is any orthogonal basis, then the normalized vectors u_i = v_i/‖ v_i ‖ form an orthonormal basis.
Example 5.3. The vectors
v_1 = ( 1, 2, −1 )^T,   v_2 = ( 0, 1, 2 )^T,   v_3 = ( 5, −2, 1 )^T,
are mutually perpendicular, v_i · v_j = 0 for i ≠ j, and so form an orthogonal basis of R^3 with respect to the standard dot product. To construct an orthonormal basis using Lemma 5.2, we divide each one by its length, so
u_1 = v_1/√6 = ( 1/√6, 2/√6, −1/√6 )^T,   u_2 = v_2/√5 = ( 0, 1/√5, 2/√5 )^T,   u_3 = v_3/√30 = ( 5/√30, −2/√30, 1/√30 )^T.
The fact that the elements of an orthonormal basis involve square roots is fairly typical.

A useful observation is that an orthogonal set of non-zero vectors is automatically


linearly independent.


Proposition 5.4. If v_1, ..., v_k ∈ V are non-zero and mutually orthogonal, then they are linearly independent.
Proof: Suppose
c_1 v_1 + ··· + c_k v_k = 0.
Let us take the inner product of the equation with any v_i. Using linearity of the inner product and the orthogonality of the elements,
0 = ⟨ c_1 v_1 + ··· + c_k v_k ; v_i ⟩ = c_1 ⟨ v_1 ; v_i ⟩ + ··· + c_k ⟨ v_k ; v_i ⟩ = c_i ⟨ v_i ; v_i ⟩ = c_i ‖ v_i ‖^2.
Therefore, provided v_i ≠ 0, we conclude that c_i = 0 for each i = 1, ..., k. This proves linear independence.   Q.E.D.
As a direct corollary, we infer that any orthogonal spanning set is automatically a
basis for its span.
Theorem 5.5. Suppose v1 , . . . , vn V are mutually orthogonal nonzero elements
that span a vector space V . Then they form an orthogonal basis of V , and so dim V = n.
The concept of orthogonality applies equally well to functions.
Example 5.6. Consider the vector space P^(2) consisting of all quadratic polynomials
p(x) = α + β x + γ x^2,
equipped with the L^2 inner product
⟨ p ; q ⟩ = ∫_0^1 p(x) q(x) dx.
The standard monomial basis 1, x, x^2 is not an orthogonal basis. Indeed,
⟨ 1 ; x ⟩ = 1/2,   ⟨ 1 ; x^2 ⟩ = 1/3,   ⟨ x ; x^2 ⟩ = 1/4.
One orthogonal basis of P^(2) is provided by the following polynomials:
p_1(x) = 1,   p_2(x) = x − 1/2,   p_3(x) = x^2 − x + 1/6.   (5.1)
Indeed, one easily verifies that ⟨ p_i ; p_j ⟩ = 0 for i ≠ j, while
‖ p_1 ‖ = 1,   ‖ p_2 ‖ = √(1/12) = 1/(2√3),   ‖ p_3 ‖ = √(1/180) = 1/(6√5).   (5.2)
The corresponding orthonormal basis is found by dividing each basis element by its norm:
u_1(x) = 1,   u_2(x) = √3 ( 2 x − 1 ),   u_3(x) = √5 ( 6 x^2 − 6 x + 1 ).   (5.3)
The result is a basis of orthonormal quadratic polynomials.

Remark : In Section 5.4 below, we will learn how to construct such orthogonal systems
of polynomials from scratch.

Does every finite-dimensional inner product space admit an orthonormal basis? The
answer is yes, and there is an explicit construction, known as the GramSchmidt process,
that constructs one.
Theorem 5.7. If V is a finite-dimensional inner product space, then there exists an
orthonormal basis of V .
We shall provide the details of the construction used to prove Theorem 5.7 in Section 5.2. Indeed, the construction will show us that any vector space of dimension > 1
admits many different orthogonal and hence orthonormal bases.
Computations in Orthogonal Bases
What are the advantages of orthogonal and orthonormal bases? As we shall soon
discover, it is very easy to write a vector as a linear combination of orthogonal basis
vectors. Referring back to (2.23), we saw that, to express a vector in terms of a new
basis, one is required to solve a system of linear equations for the new coordinates. If
the dimension of the vector space is large, then this may require a considerable amount
of effort. But, if the basis is orthogonal, or, even better, orthonormal, then the change of
basis computation requires almost no work. The basic idea is contained in the following
theorem:
Theorem 5.8. Let u_1, ..., u_n be an orthonormal basis for an inner product space V. Then one can write the general element v ∈ V in terms of the basis as
v = c_1 u_1 + ··· + c_n u_n,   (5.4)
where the coordinates of v with respect to the orthonormal basis are given as inner products
c_i = ⟨ v ; u_i ⟩,   i = 1, ..., n.   (5.5)
Moreover, the norm of the vector
‖ v ‖ = √( Σ_{i=1}^n c_i^2 ) = √( Σ_{i=1}^n ⟨ v ; u_i ⟩^2 )   (5.6)
is computed by summing the squares of its coordinates.

Proof: Let us compute the inner product of (5.4) with one of the basis vectors. Using the orthonormality conditions
⟨ u_i ; u_j ⟩ = 0 for i ≠ j,   ⟨ u_i ; u_j ⟩ = 1 for i = j,
and bilinearity of the inner product, we find
⟨ v ; u_i ⟩ = ⟨ Σ_{j=1}^n c_j u_j ; u_i ⟩ = Σ_{j=1}^n c_j ⟨ u_j ; u_i ⟩ = c_i ‖ u_i ‖^2 = c_i.
To prove formula (5.6), we similarly expand
‖ v ‖^2 = ⟨ v ; v ⟩ = Σ_{i,j=1}^n c_i c_j ⟨ u_i ; u_j ⟩ = Σ_{i=1}^n c_i^2,
again using the orthonormality of the basis elements.   Q.E.D.
Example 5.9. Let us rewrite the vector v = ( 1, 1, 1 )^T in terms of the orthonormal basis
u_1 = ( 1/√6, 2/√6, −1/√6 )^T,   u_2 = ( 0, 1/√5, 2/√5 )^T,   u_3 = ( 5/√30, −2/√30, 1/√30 )^T,
constructed in Example 5.3. Computing the dot products
⟨ v ; u_1 ⟩ = 2/√6,   ⟨ v ; u_2 ⟩ = 3/√5,   ⟨ v ; u_3 ⟩ = 4/√30,
we conclude that
v = (2/√6) u_1 + (3/√5) u_2 + (4/√30) u_3,
as the reader can validate. Needless to say, a direct computation based on solving the associated linear system, as in Chapter 2, is considerably more tedious.
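Formula (5.5) reduces the change of basis to a handful of dot products. The sketch below (Python with NumPy; an illustration added here, not part of the original text) reconstructs the vector of Example 5.9 from its coordinates in the orthonormal basis of Example 5.3, and checks the norm identity (5.6).

    import numpy as np

    u1 = np.array([1.0, 2.0, -1.0]) / np.sqrt(6.0)
    u2 = np.array([0.0, 1.0,  2.0]) / np.sqrt(5.0)
    u3 = np.array([5.0, -2.0, 1.0]) / np.sqrt(30.0)
    U = np.column_stack([u1, u2, u3])

    v = np.array([1.0, 1.0, 1.0])
    c = U.T @ v                    # coordinates c_i = <v ; u_i>, equation (5.5)
    print(c)                       # [2/sqrt(6), 3/sqrt(5), 4/sqrt(30)]
    print(U @ c)                   # reconstructs v = c_1 u_1 + c_2 u_2 + c_3 u_3
    print(np.linalg.norm(v), np.sqrt(np.sum(c**2)))   # equal, by (5.6)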
While it is straightforward to pass from an orthogonal basis to its orthonormal sibling,
we shall often find it more convenient to work directly with the former. The next result
provides the corresponding formula expressing a vector in terms of an orthogonal, but not
necessarily orthonormal basis. The proof proceeds exactly as in the orthonormal case, and
details are left to the reader to fill in.
Theorem 5.10. If v_1, ..., v_n form an orthogonal basis, then the coordinates of a vector
v = a_1 v_1 + ··· + a_n v_n   are given by   a_i = ⟨ v ; v_i ⟩ / ‖ v_i ‖^2.   (5.7)
In this case, the norm of a vector can be computed via the formula
‖ v ‖^2 = Σ_{i=1}^n a_i^2 ‖ v_i ‖^2 = Σ_{i=1}^n ( ⟨ v ; v_i ⟩ / ‖ v_i ‖ )^2.   (5.8)
Equation (5.7), along with its orthonormal simplification (5.5), is one of the most important formulas in this text. In particular, all of Fourier analysis, including signal processing, is founded on particular function space specializations of these formulae.
Example 5.11. The wavelet basis
v_1 = ( 1, 1, 1, 1 )^T,   v_2 = ( 1, 1, −1, −1 )^T,   v_3 = ( 1, −1, 0, 0 )^T,   v_4 = ( 0, 0, 1, −1 )^T,   (5.9)
introduced in Example 2.34 is, in fact, an orthogonal basis of R^4. The norms are
‖ v_1 ‖ = 2,   ‖ v_2 ‖ = 2,   ‖ v_3 ‖ = √2,   ‖ v_4 ‖ = √2.
Therefore, using (5.7), we can readily express any vector as a linear combination of the wavelet basis vectors. For example,
v = ( 4, −2, 1, 5 )^T = 2 v_1 − v_2 + 3 v_3 − 2 v_4,
where the wavelet basis coordinates are computed as
⟨ v ; v_1 ⟩/‖ v_1 ‖^2 = 8/4 = 2,   ⟨ v ; v_2 ⟩/‖ v_2 ‖^2 = −4/4 = −1,   ⟨ v ; v_3 ⟩/‖ v_3 ‖^2 = 6/2 = 3,   ⟨ v ; v_4 ⟩/‖ v_4 ‖^2 = −4/2 = −2.
This is clearly a lot quicker than solving the linear system, as we did in Example 2.34.
Example 5.12. The same formulae apply to orthogonal bases in function spaces. For example, to express a quadratic polynomial
p(x) = c_1 p_1(x) + c_2 p_2(x) + c_3 p_3(x) = c_1 + c_2 ( x − 1/2 ) + c_3 ( x^2 − x + 1/6 )
in terms of the orthogonal basis (5.1), we merely compute the inner product integrals
c_1 = ⟨ p ; p_1 ⟩/‖ p_1 ‖^2 = ∫_0^1 p(x) dx,   c_2 = ⟨ p ; p_2 ⟩/‖ p_2 ‖^2 = 12 ∫_0^1 p(x) ( x − 1/2 ) dx,   c_3 = ⟨ p ; p_3 ⟩/‖ p_3 ‖^2 = 180 ∫_0^1 p(x) ( x^2 − x + 1/6 ) dx.
Thus, for example,
p(x) = x^2 + x + 1 = 11/6 + 2 ( x − 1/2 ) + ( x^2 − x + 1/6 ),
where the coefficients can either be produced directly, or by the integration formulae.
Example 5.13. Perhaps the most important example of an orthogonal basis is provided by the basic trigonometric functions. Let T^(n) denote the vector space consisting of all trigonometric polynomials
T(x) = Σ_{0 ≤ j+k ≤ n} a_{jk} (sin x)^j (cos x)^k
of degree ≤ n. While the monomials (sin x)^j (cos x)^k span T^(n), they do not form a basis owing to identities stemming from the basic relation cos^2 x + sin^2 x = 1; see Example 2.20. According to Exercise , a more convenient spanning set is provided by the 2n + 1 elementary trigonometric functions
1,  cos x,  sin x,  cos 2x,  sin 2x,  ...,  cos nx,  sin nx,   (5.10)
which form an orthogonal basis of T^(n), which, as a consequence, has dimension 2n + 1.


To establish this important fact, we adopt a rescaled version of the L 2 inner product
and norm:
Z
Z
1
1
2
hf ;gi =
f (x) g(x) dx,
kf k =
f (x)2 dx.
(5.11)


The elementary integration formulae

k 6= l,
Z
0,
cos k x cos l x dx =
2, k = l = 0,

, k = l 6= 0,

sin k x sin l x dx =

0,

k 6= l,

, k = l 6= 0,

cos k x sin l x dx = 0,

(5.12)

which are valid for all nonnegative integers k, l 0, imply the orthogonality relations
h cos k x ; cos l x i = h sin k x ; sin l x i = 0,
k cos k x k = k sin k x k = 1,

k 6= 0,

k 6= l,

h cos k x ; sin l x i = 0,

k1k = 2.

(5.13)

Theorem 5.5 assures us that the trigonometric functions (5.10) form a basis for T^(n), a fact that is not so easy to establish directly. The only constituent which is not a unit function is, ironically, the constant function 1. Replacing 1 by 1/√2 in (5.10) would give us an orthonormal basis for T^(n) with respect to the rescaled inner product (5.11). However, in practice this is not done, in order to avoid the extraneous factor of √2.
Orthogonality of the trigonometric functions (5.10) means that we can compute the coefficients a_0, ..., a_n, b_1, ..., b_n of any trigonometric polynomial
p(x) = a_0 + Σ_{k=1}^n [ a_k cos kx + b_k sin kx ]   (5.14)
by an explicit integration formula. Namely,
a_0 = ⟨ f ; 1 ⟩/‖ 1 ‖^2 = (1/2π) ∫_{−π}^{π} f(x) dx,
a_k = ⟨ f ; cos kx ⟩ = (1/π) ∫_{−π}^{π} f(x) cos kx dx,   b_k = ⟨ f ; sin kx ⟩ = (1/π) ∫_{−π}^{π} f(x) sin kx dx,   k ≥ 1.   (5.15)
These formulae will play an essential role in the theory and applications of Fourier series; see Chapter 11.
5.2. The Gram-Schmidt Process.

Once one becomes convinced of the utility of orthogonal and orthonormal bases, the natural question follows: How can we construct them? The basic algorithm was first discovered by Pierre-Simon Laplace in the eighteenth century. Today it is known as the

Gram-Schmidt process, after its rediscovery by Jorgen Gram, whom we already met in Chapter 3, and Erhard Schmidt.
Let V denote a finite-dimensional inner product space. (To begin with, the reader can assume V is a subspace of R^n with the standard Euclidean dot product, although the algorithm will be formulated in complete generality.) We assume that we already know some basis w_1, ..., w_n of V, which has dimension n, and wish to use this information to construct an orthogonal basis v_1, ..., v_n.
We will determine the vectors in the orthogonal basis in order. The first basis element v_1 can be any non-zero element of V, and so there is no harm in choosing
v_1 = w_1.   (5.16)
The second basis vector must be orthogonal to the first: ⟨ v_2 ; v_1 ⟩ = 0. Let us try subtracting a suitable multiple of the first basis vector from the second,
v_2 = w_2 − c v_1,   (5.17)
in order to arrange orthogonality. We compute
⟨ v_2 ; v_1 ⟩ = ⟨ w_2 ; v_1 ⟩ − c ⟨ v_1 ; v_1 ⟩ = ⟨ w_2 ; v_1 ⟩ − c ‖ v_1 ‖^2 = 0.
This requires
c = ⟨ w_2 ; v_1 ⟩ / ‖ v_1 ‖^2,   and therefore   v_2 = w_2 − ( ⟨ w_2 ; v_1 ⟩ / ‖ v_1 ‖^2 ) v_1.
The linear independence of v_1 = w_1 and w_2 ensures that v_2 ≠ 0.


Next, we construct
v_3 = w_3 − c_1 v_1 − c_2 v_2   (5.18)
by subtracting suitable multiples of the first two orthogonal basis elements from w_3. We want v_3 to be orthogonal to both v_1 and v_2. Since we already arranged that ⟨ v_1 ; v_2 ⟩ = 0, this requires
0 = ⟨ v_3 ; v_1 ⟩ = ⟨ w_3 ; v_1 ⟩ − c_1 ⟨ v_1 ; v_1 ⟩,   0 = ⟨ v_3 ; v_2 ⟩ = ⟨ w_3 ; v_2 ⟩ − c_2 ⟨ v_2 ; v_2 ⟩,
and hence,
c_1 = ⟨ w_3 ; v_1 ⟩ / ‖ v_1 ‖^2,   c_2 = ⟨ w_3 ; v_2 ⟩ / ‖ v_2 ‖^2.
Therefore, the next orthogonal basis vector is given by the formula
v_3 = w_3 − ( ⟨ w_3 ; v_1 ⟩ / ‖ v_1 ‖^2 ) v_1 − ( ⟨ w_3 ; v_2 ⟩ / ‖ v_2 ‖^2 ) v_2.
Note that v_1 ≠ 0 since v_1 = w_1 appears in the original basis, and no basis element can be zero.

Continuing in the same manner, suppose we have already constructed the mutually orthogonal vectors v_1, ..., v_{k−1} as linear combinations of w_1, ..., w_{k−1}. The next orthogonal basis element v_k will be obtained from w_k by a formula of the form
v_k = w_k − c_1 v_1 − ··· − c_{k−1} v_{k−1}.
Since v_1, ..., v_{k−1} are already orthogonal, the orthogonality constraint
0 = ⟨ v_k ; v_j ⟩ = ⟨ w_k ; v_j ⟩ − c_j ⟨ v_j ; v_j ⟩
requires
c_j = ⟨ w_k ; v_j ⟩ / ‖ v_j ‖^2   for   j = 1, ..., k − 1.   (5.19)
In this fashion, we produce the general Gram-Schmidt formula
v_k = w_k − Σ_{j=1}^{k−1} ( ⟨ w_k ; v_j ⟩ / ‖ v_j ‖^2 ) v_j,   k = 1, ..., n,   (5.20)
which serves as a recursive procedure for constructing the mutually orthogonal basis vectors v_1, ..., v_n. A simple induction demonstrates that
v_k = c_{k1} w_1 + ··· + c_{k,k−1} w_{k−1} + w_k   (5.21)
is a certain linear combination of the original basis vectors. In particular, v_k ≠ 0, since otherwise w_1, ..., w_k would be linearly dependent, and hence not a basis, as we originally assumed. Proposition 5.4 will now guarantee that the orthogonal vectors v_1, ..., v_n are linearly independent and hence form an orthogonal basis for V. If we really want an orthonormal basis u_1, ..., u_n, we merely normalize the orthogonal basis vectors v_1, ..., v_n, as in Lemma 5.2, and set u_k = v_k/‖ v_k ‖ for k = 1, ..., n.
Example 5.14. The vectors
w_1 = ( 1, 1, −1 )^T,   w_2 = ( 1, 0, 2 )^T,   w_3 = ( 2, −2, 3 )^T,   (5.22)
are readily seen to form a basis of R^3. To construct an orthogonal basis (with respect to the standard dot product) using the Gram-Schmidt algorithm, we begin by setting
v_1 = w_1 = ( 1, 1, −1 )^T.
This will, in fact, be a consequence of the successful completion of the Gram-Schmidt algorithm and does not need to be checked in advance. If the given vectors were not linearly independent, then eventually one of the Gram-Schmidt vectors would vanish, and the process will break down.

The next basis element is
v_2 = w_2 − ( w_2 · v_1 / ‖ v_1 ‖^2 ) v_1 = ( 1, 0, 2 )^T + (1/3) ( 1, 1, −1 )^T = ( 4/3, 1/3, 5/3 )^T.
The last element of our orthogonal basis is
v_3 = w_3 − ( w_3 · v_1 / ‖ v_1 ‖^2 ) v_1 − ( w_3 · v_2 / ‖ v_2 ‖^2 ) v_2 = ( 2, −2, 3 )^T + ( 1, 1, −1 )^T − (3/2) ( 4/3, 1/3, 5/3 )^T = ( 1, −3/2, −1/2 )^T.
The reader can verify the orthogonality of v_1, v_2, v_3.
An orthonormal basis is obtained by dividing each vector by its length. Since
‖ v_1 ‖ = √3,   ‖ v_2 ‖ = √(14/3),   ‖ v_3 ‖ = √(7/2),
we produce
u_1 = ( 1/√3, 1/√3, −1/√3 )^T,   u_2 = ( 4/√42, 1/√42, 5/√42 )^T,   u_3 = ( 2/√14, −3/√14, −1/√14 )^T   (5.23)
as the corresponding orthonormal basis vectors.
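The recursive formula (5.20) translates almost verbatim into code. The following sketch (Python with NumPy; an illustration added here, not part of the original text) produces the orthogonal vectors v_1, ..., v_n and, after normalization, reproduces the orthonormal basis (5.23) of Example 5.14.

    import numpy as np

    def gram_schmidt(W):
        """Classical Gram-Schmidt, formula (5.20): W is a list of basis vectors."""
        V = []
        for w in W:
            v = w.astype(float)
            for vj in V:
                v = v - (w @ vj) / (vj @ vj) * vj   # subtract the projection onto v_j
            V.append(v)
        return V

    W = [np.array([1.0, 1.0, -1.0]),
         np.array([1.0, 0.0,  2.0]),
         np.array([2.0, -2.0, 3.0])]
    V = gram_schmidt(W)
    U = [v / np.linalg.norm(v) for v in V]          # orthonormal basis, as in (5.23)
    print(V[1], V[2])   # [1.3333 0.3333 1.6667] and [ 1.  -1.5 -0.5]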

Example 5.15. Here is a typical sort of problem: using the standard dot product, find an orthonormal basis for the subspace V ⊂ R^4 consisting of all vectors which are orthogonal to the vector a = ( 1, 2, −1, −3 )^T. Now, a vector x = ( x_1, x_2, x_3, x_4 )^T is orthogonal to a if and only if
x · a = x_1 + 2 x_2 − x_3 − 3 x_4 = 0.
Solving this linear system, the free variables are x_2, x_3, x_4, and so a (non-orthogonal) basis for the subspace is
w_1 = ( −2, 1, 0, 0 )^T,   w_2 = ( 1, 0, 1, 0 )^T,   w_3 = ( 3, 0, 0, 1 )^T.
To obtain an orthogonal basis, we apply the Gram-Schmidt algorithm. First,
v_1 = w_1 = ( −2, 1, 0, 0 )^T.

The next element is
v_2 = w_2 − ( w_2 · v_1 / ‖ v_1 ‖^2 ) v_1 = ( 1, 0, 1, 0 )^T + (2/5) ( −2, 1, 0, 0 )^T = ( 1/5, 2/5, 1, 0 )^T.
The last element of our orthogonal basis is
v_3 = w_3 − ( w_3 · v_1 / ‖ v_1 ‖^2 ) v_1 − ( w_3 · v_2 / ‖ v_2 ‖^2 ) v_2 = ( 3, 0, 0, 1 )^T + (6/5) ( −2, 1, 0, 0 )^T − (1/2) ( 1/5, 2/5, 1, 0 )^T = ( 1/2, 1, −1/2, 1 )^T.
An orthonormal basis can then be obtained by dividing each v_i by its length:
u_1 = ( −2/√5, 1/√5, 0, 0 )^T,   u_2 = ( 1/√30, 2/√30, 5/√30, 0 )^T,   u_3 = ( 1/√10, 2/√10, −1/√10, 2/√10 )^T.   (5.24)

Incidentally, the Gram-Schmidt procedure provides a constructive proof of the existence of orthogonal and orthonormal bases of finite-dimensional vector spaces, which was the content of Theorem 5.7. In Section 5.4 we will apply the Gram-Schmidt process, in exactly the same manner, to construct orthogonal bases and orthonormal bases of finite-dimensional subspaces of function space.
At the kth stage of the procedure, the elements v_1, ..., v_k form an orthogonal basis, and their normalized counterparts u_1, ..., u_k an orthonormal basis, of the subspace
V_k = span { w_1, ..., w_k } = span { v_1, ..., v_k } = span { u_1, ..., u_k }   (5.25)
spanned by the original first k basis elements.
The reader may have already noticed the similarity between the Gram-Schmidt coefficients in (5.20) and the formulae (5.7) for the coefficients of a vector with respect to an orthogonal basis. Indeed, one can interpret the vector subtracted from w_k in (5.20) as if w_k were in the subspace V_{k−1} spanned by the orthogonal basis vectors v_1, ..., v_{k−1}. However, w_k is not in that subspace, since otherwise w_1, ..., w_k would not be linearly independent. This observation, and the connections with least squares minimization, will be properly developed in Section 5.5.
Remark: We have formulated the Gram-Schmidt process in such a manner that the formulae apply equally well to general inner products and to complex inner product spaces. Since a Hermitian inner product is conjugate symmetric, the one tricky point is to make sure that the order of the vectors in each inner product is correct.

A Modified Gram-Schmidt Process
Let us reformulate the Gram-Schmidt method in a form that directly produces the orthonormal basis vectors u_1, ..., u_n from the basis w_1, ..., w_n. According to (5.25), we can write the original basis in terms of the orthonormal basis in the triangular form
w_1 = r_{11} u_1,
w_2 = r_{12} u_1 + r_{22} u_2,
w_3 = r_{13} u_1 + r_{23} u_2 + r_{33} u_3,
  ...
w_n = r_{1n} u_1 + r_{2n} u_2 + ··· + r_{nn} u_n.   (5.26)
The coefficients r_{ij} can be computed by taking inner products with one of the orthonormal basis vectors u_i and using the orthonormality constraints. Thus,
r_{ij} = ⟨ w_j ; u_i ⟩.   (5.27)
On the other hand, according to (5.6),
‖ w_j ‖^2 = r_{1j}^2 + ··· + r_{j−1,j}^2 + r_{jj}^2.   (5.28)
The pair of equations (5.27), (5.28) can be used to devise a recursive procedure to compute the orthonormal basis. At stage j, we have already computed u_1, ..., u_{j−1}. We then compute
r_{ij} = ⟨ w_j ; u_i ⟩,   for each   i = 1, ..., j − 1.   (5.29)
We then obtain the next orthonormal basis vector u_j by the formulae
v_j = w_j − r_{1j} u_1 − ··· − r_{j−1,j} u_{j−1},   r_{jj} = ‖ v_j ‖,   u_j = v_j / r_{jj}.   (5.30)
Example 5.16. Let us apply the algorithm to the vectors
w_1 = ( 1, 1, −1 )^T,   w_2 = ( 1, 0, 2 )^T,   w_3 = ( 2, −2, 3 )^T,
of Example 5.14. The first step is to set
v_1 = w_1,   r_{11} = ‖ v_1 ‖ = ‖ w_1 ‖ = √3,   and so   u_1 = v_1 / r_{11} = ( 1/√3, 1/√3, −1/√3 )^T.
Next, we compute
r_{12} = ⟨ w_2 ; u_1 ⟩ = −1/√3,   v_2 = w_2 − r_{12} u_1 = ( 4/3, 1/3, 5/3 )^T.
For j = 1, there is nothing to do.


Furthermore,
r_{22} = ‖ v_2 ‖ = √(14/3),   and so   u_2 = v_2 / r_{22} = ( 4/√42, 1/√42, 5/√42 )^T.
The final step uses
r_{13} = ⟨ w_3 ; u_1 ⟩ = −√3,   r_{23} = ⟨ w_3 ; u_2 ⟩ = √(21/2),
v_3 = w_3 − r_{13} u_1 − r_{23} u_2 = ( 1, −3/2, −1/2 )^T,
r_{33} = ‖ v_3 ‖ = √(7/2),   and so   u_3 = v_3 / r_{33} = ( 2/√14, −3/√14, −1/√14 )^T.
Note that we end up with precisely the same orthonormal basis vectors as in Example 5.14.

For hand computations, the orthogonal version of the Gram-Schmidt algorithm is easier, even if one does ultimately want an orthonormal basis, since it avoids the square roots that are ubiquitous in the orthonormal version (5.29), (5.30). On the other hand, for numerical implementation on a computer, the orthonormal version is a bit faster, as it involves fewer arithmetic operations.
For practical, large scale computations, however, the preceding versions of the Gram-Schmidt algorithm both suffer from a serious flaw. The method is numerically unstable, and numerical errors, e.g., round-off, may cause it to produce non-orthogonal vectors. Fortunately, there is a simple rearrangement of the calculation that obviates this difficulty and leads to a numerically robust algorithm.
The idea is, instead of working with the vectors in order, modifying w_j to ensure orthogonality all at once, to work gradually, subtracting off orthogonal projections one at a time as they arise. More specifically, the first step of the algorithm is the same: we set u_1 = w_1/‖ w_1 ‖. We then subtract off the appropriate multiples of u_1 from each of the remaining basis vectors to ensure their orthogonality to u_1. In other words, we let
w_i^{(2)} = w_i − ⟨ w_i ; u_1 ⟩ u_1,   for   i = 2, ..., n,
which are all orthogonal to u_1. We can normalize w_2^{(2)} to produce the second orthonormal basis vector u_2 = w_2^{(2)}/‖ w_2^{(2)} ‖. The next step is to subtract off the appropriate multiples of u_2 from the remaining modified vectors,
w_i^{(3)} = w_i^{(2)} − ⟨ w_i^{(2)} ; u_2 ⟩ u_2,   i = 3, ..., n,
so that they are orthogonal to u_2. Note that each w_i^{(3)} is nonzero (as otherwise the original basis would fail to be linearly independent) and is also orthogonal to u_1, since both of its summands are by the previous step. The next orthonormal basis element is then u_3 = w_3^{(3)}/‖ w_3^{(3)} ‖. One then continues on in this manner. The full algorithm starts with the initial basis vectors w_j^{(1)} = w_j, and the general step is
u_j = w_j^{(j)} / ‖ w_j^{(j)} ‖,   w_k^{(j+1)} = w_k^{(j)} − ⟨ w_k^{(j)} ; u_j ⟩ u_j,   j = 1, ..., n,   k = j + 1, ..., n.   (5.31)

Example 5.17. Let us apply the stable GramSchmidt process (5.31) to the basis
vectors

1
0
2
w 3 = 2 .
w 2 = 4 ,
w1 = 2 ,
3
1
1
(1)

(1)

(1)

We set w1 = w1 , w2 = w2 , w3 = w3 . The first orthonormal basis vector is


2
(1)

u1 =

w1

(1)
k w1

Next, we compute
(2)

w2

2
(1)
(1)
= w 2 h w 2 ; u 1 i u1 = 2 ,
0

(2)

w3

The second orthonormal basis vector is

(2)

u2 =

w2

(2)

k w2 k

Finally

(3)
w3

(2)
w3

(2)
h w3

; u 2 i u2 =

3
2
3
13

1
2
1
2

1
(1)
(1)
= w 3 h w 3 ; u 1 i u1 = 0 .
2

1
2
1

(3)

u3 =

w3

(3)

k w3 k

The resulting vectors u1 , u2 , u3 form the desired orthonormal basis.

1
6
1
6
4

5.3. Orthogonal Matrices.


Matrices whose columns form an orthonormal basis of R n relative to the Euclidean
inner product play a special role in geometry and applications. orthogonal matrices appear in a wide range of applications in physics, quantum mechanics, partial differential
equations, symmetry theory, and special functions. Rotational motions of bodies in threedimensional space are described by orthogonal matrices, and hence they lie at the foundations of rigid body mechanics, including satellite and underwater vehicle motions, as well
as three-dimensional computer graphics and animation. Furthermore, orthogonal matrices
underly one the most important methods of numerical linear algebra, the Q R algorithm
for computing eigenvalues of positive definite matrices, to be discussed in Section 9.6.
Definition 5.18. A square matrix Q is called an orthogonal matrix if it satisfies
QT Q = I .
3/7/03

164

(5.32)
c 2003

Peter J. Olver

The orthogonality condition implies that one can easily invert an orthogonal matrix:
Q1 = QT .

(5.33)

In fact the two conditions are equivalent, and hence a matrix is orthogonal if and only if
its inverse is equal to its transpose. The second important characterization of orthogonal
matrices relates them directly to orthonormal bases.
Proposition 5.19. A matrix Q is orthogonal if and only if its columns form an
orthonormal basis with respect to the Euclidean dot product on R n .
T

Proof : Let u1 , . . . , un be the columns of Q = ( u1 u2 . . . un ) . Then uT1 , . . . , uTn are


the rows of the transposed matrix QT . The (i, j)th entry of the product QT Q = I equals
the dot product

1,
i = j,
T
ui uj = u i u j =
(5.34)
0,
i 6= j,
which is precisely the condition for u1 , . . . , un to form an orthonormal basis.

Q.E.D.

So, technically, we should be referring to an orthonormal matrix, not an orthogonal matrix. But the terminology is so standard that we are forced to adopt it here.

a b
Example 5.20. A 2 2 matrix Q =
is orthogonal if and only if its columns
c d


b
a
form an orthonormal basis of R 2 . Equivalently, the reqauirement
, u2 =
u1 =
d
c

a c
a b
a + c2 a c + b d
1 0
T
Q Q=
=
=
,
d d
c d
a c + b d b 2 + d2
0 1
implies that its entries must satisfy the algebraic equations
a2 + c2 = 1,

a c + b d = 0,
T

b2 + d2 = 1.
T

The first and last equations say the points ( a, c ) and ( b, d ) lie on the unit circle k u k =
1, and so
a = cos ,
c = sin ,
b = cos ,
d = sin ,
for some choice of angles , . The remaining orthogonality condition is
0 = cos cos + sin sin = cos( ),

and hence = 21 differ by a right angle. The sign leads to two cases:
b = sin ,

d = cos ,

or

b = sin ,

d = cos .

As a result, every 2 2 orthogonal matrix has one of two possible forms

cos sin
cos
sin
or
,
where
0 < 2 .
sin cos
sin cos

(5.35)

The corresponding orthonormal bases are illustrated in Figure 5.1.


3/7/03

165

c 2003

Peter J. Olver

u2
u1

u1

u2

Orthonormal Bases in R 2 .

Figure 5.1.

Lemma 5.21. An orthogonal matrix has determinant det Q = 1.


Proof : Taking the determinant of (5.32) gives

1 = det I = det(QT Q) = det QT det Q = (det Q)2 ,


which immediately proves the lemma.

Q.E.D.

An orthogonal matrix is called proper if it has determinant + 1. Geometrically, proper


orthogonal matrices correspond to rotations of R n , while improper orthogonal matrices give
reflections; see Section 7.4 for details. For instance, in the 22 case (5.35), the first matrix
has determinant + 1 and represents a proper rotation through angle , while the second
has determinant 1 and so represents a reflection.
T

Example 5.22. A 3 3 orthogonal matrix Q = ( u1 u2 u3 ) is prescribed by 3


mutually perpendicular vectors of unit length in R 3 . For instance, the orthonormal basis
constructed in (5.23) corresponds to the orthogonal matrix
1

4
2

Q=

3
1

13

42
1

42
5
42

14
314
114

The orthogonal matrix is proper, det Q = + 1 if and only if the basis vectors form a right
handed system, and improper if they form a left handed system. Right handed orthonormal
bases, such as the one represented by the columns of this particular orthogonal matrix, are
obtained from the standard orthonormal basis e1 , e2 , e3 by a rotation the rotation the
orthogonal matrix represents while left handed bases live in the mirror image world,
and are obtained by reflecting a right handed basis.
Proposition 5.23. The product of two orthogonal matrices is also orthogonal.
Proof : If QT1 Q1 = I = QT2 Q2 , then
(Q1 Q2 )T (Q1 Q2 ) = QT2 QT1 Q1 Q2 = QT2 Q2 = I ,
and so Q1 Q2 is also orthogonal.
3/7/03

Q.E.D.
166

c 2003

Peter J. Olver

This property says that the set of all orthogonal matrices forms a group , known as
the orthogonal group. The orthogonal group forms the foundation of Euclidean geometry.
It plays a key role in rigid body mechanics, as well as computer graphics and animation,
where analyzing the rotations and reflections of three-dimensional objects is of essential
importance.
The Q R Factorization
The GramSchmidt algorithm for vectors in R n can be reinterpreted as a method
of factorizing nonsingular matrices. This is a more subtle factorization than the L U
decomposition of Gaussian elimination, but has a similarly broad range of applications in
mathematics, physics and numerical analysis.
Let w1 , . . . , wn be a general basis of R n , and let u1 , . . . , un be its orthonormalization.
We assemble both sets of column vectors into the nonsingular n n matrices
A = ( w1 w2 . . . wn ),

Q = ( u1 u2 . . . un ).

Since the ui form an orthonormal basis, Q is an orthogonal matrix. Since matrix multiplication acts column-wise implies that the GramSchmidt equations (5.26) can be written
in the equivalent matrix form
r
r
... r
11

A = Q R,

where

0
R=
..
.
0

12

r22
..
.
0

1n

. . . r2n
..
..

.
.
. . . rnn

(5.36)

is an upper triangular matrix. Since the GramSchmidt process works on any basis, the
only requirement on the matrix A is that its columns form a basis of R n , and hence A
can be any nonsingular matrix. We have therefore established the Q R factorization of
nonsingular matrices.
Theorem 5.24. Any nonsingular matrix A can be factorized, A = Q R, into the
product of an orthogonal matrix Q and an upper triangular matrix R. The factorization
is unique if all the diagonal entries of R are assumed to be positive.
The proof of uniqueness is left as an exercise. We will use the compacted term positive
upper triangular to refer to such upper triangular matrices with positive entries along the
diagonal.
Example 5.25. The columns of the matrix

1 1 2
A = 1 0 2
1 2 3

(5.37)

The reader unfamiliar with the mathematical definition of a group can ignore this observation,
as it will not play a significant role in this book. However, the group concept is fundmanetal in a
broad range of mathematics and its applications, and well worth studying.

3/7/03

167

c 2003

Peter J. Olver

are the same as the basis vectors considered in Example 5.16. The orthonormal basis
(5.23) constructed using the GramSchmidt algorithm leads to the orthogonal and upper
triangular matrices

Q=

1
3
1

3
13

4
42
1

42
5
42

2
14
314
114

R=
0
0

13

14
3

21 ,

2
7
2

where the entries of R are the coefficients rij computed in the earlier example. The reader
may wish to verify that, indeed, A = Q R.

While any of the three versions of the GramSchmidt algorithm will produce the Q R
T
factorization of a given matrix A = ( w1 w2 . . . wn ) , the last version, as encoded in
equations (5.31) is the one to use, as it is the least likely to fail due to numerical artifacts
coming from round-off errors. We reformulate the algorithm purely in terms of the matrix
entries aij of A, which we display in the form of a program written in pseudocode, suitable
for practical computations. During the course of the algorithm, the entries of the matrix
A are sucessively modified, and the A similar modification of the Q R factorization leads to
the practical algorithm, which overwrites the matrix A to produce the orthogonal matrix
Q. The program is stated so that it also works for rectangular, m n, matrices, in which
case Q is an m n matrix whose columns are the orthonormal basis vectors resulting from
3/7/03

168

c 2003

Peter J. Olver

Q R Factorization of a Matrix A
start
for j = 1 to n
q
a21j + + a2nj
set rjj =

if rjj = 0, stop; print A has linearly dependent columns


else for i = 1 to n
set aij = aij /rjj
next i
for k = j + 1 to n
set rjk = a1j a1k + + anj ank
for i = 1 to n
set aik = aik aij rjk
next i

next k
next j
end

applying GramSchmidt to the columns of A and R is an n n upper triangular matrix.


Example 5.26. Let us apply the Q R algorithm stated above to the matrix A =

2 1 0 0
1 2 1 0

, using the numerically stable factorization algorithm. As in Gaussian


0 1 2 1
0 0 1 2
elimination, we work directly onthe matrix A, gradually changing it into orthogonal form.
In the first loop, we set r11 = 5 to be the norm of the first column vector; normalizing
2

1
0
0
15

2
1
0
. The next entries r = 4 ,
5
the first column, the revised matrix is
12

5
0 1 2 1

0 0 1 2
r13 = 15 , r14 = 0, are obtained by taking the dot products of the first column with
the other three columns. Then we subtract the appropriate multiple of the first column
from the other three columns to arrange that they are all orthogonal to the first; the
3/7/03

169

c 2003

Peter J. Olver

2
5
1
5

35

25

6
4

0
5
5
is a matrix whose first column is normalized to have unit
result

0
1
2 1
0
0
1 2
length, and whose second, third and fourth columns are orthogonalq
to it. In the next loop,

14
we normalize the second column by dividing by its norm r22 =
5 , and so obtaining
2

370 25 0
5
1

4
6

0
5
5
70
. We then take dot products of the second column
the matrix

0
2 1
70

5
r24 =
with the remaining two to produce r23 =
14 ; subtracting these multiples,
2
2
3

370
7
14
5
1

47 37
70
5
has its first two columns orthonormalized, and the last two

6
9
5

7
14
70
16 ,
70

0
0
1
2
orthogonal to both of the
the third column by dividing

first two. We then normalize


2
2
3
3

70
14
105

5
1
6
4
3
q

7
15
5
70
105
, and then
by r33 =
. Finally, r34 = 20
7 , and so
105
9
0
5
6
14

70
105
7

2
0
0
105
q
r44 = 56 , leading to the final formulas

2
5
1

Q=
0

370
6
70
5
70
0

2
105
4
105
6
105
7
105

for the A = Q R factorization.

130
2
30

330
4
30

R=

4
q5
14
5

1
5
16
q70

15
7

0
q
5
14
,

20
105
q
5
6

An important application of the Q R factorization is to the computation of eigenvalues,


which we discuss in Section 9.6. One can also use it to solve linear systems. Indeed, the
system A x = b becomes
Q R x = b,

and hence

R x = QT b,

since Q1 = QT is an orthogonal matrix. Since R is upper triangular, the latter system


can be solved for x by back substitution.
3/7/03

170

c 2003

Peter J. Olver

Example 5.27. Let us apply the A = Q R factorization

1
1
1

1
0
2

2 =

1
3
1
3
13

4
42
1
42
5

42

2
14

314
114

13
q
14
3


3
q
21
,
q2
7
2

that we found in Example 5.25 to solve the system A x = b = ( 0, 4, 5 ) . We first


compute


1
1
1

0
q 3
3
3
3

21
4
4 =
1
5
QT b =
q 2 .
42
42
42
7
5
2
1
3
14

14

14

We then solve the upper triangular system

Rx = 0

3
q3 3
q x

21
21

y =
2

q2
q z
7
7

13
q
14
3

by back substitution, leading to x = ( 2, 0, 1 ) . Of course, this is probably not so conveneitn for hand calculation. However, the method does offer some numerical advantages
over traditional Gaussian elimination, particularly for matrices with small, but non-zero
determinant.

5.4. Orthogonal Polynomials.


Orthogonal and orthonormal bases play a similarly useful role in the analysis of function spaces. Unlike the Euclidean space R n , most obvious bases of a (finite dimensional)
function space are typically not orthogonal with respect to any useful inner product. Thus,
the computation of an orthonormal basis of functions is a critical step towards simplifying
the analysis. The GramSchmidt process applies in the same manner as before, and leads
to the classical orthogonal polynomials and other families of orthogonal functions that
arise in approximation and interpolation theory. Other orthogonal systems of functions
play starring roles in Fourier analysis and its generalizations, in quantum mechanics, in
the solution of partial differential equations by separation of variables, and many other
applications.
In this section, we concentrate on a particularly important example of orthogonal
polynomials. Orthogonal systems of trigonometric functions will appear in Chapters 11
and 12. Orthogonal systems of special functions, including Bessel functions and spherical
harmonics, arise in our presentation of solution methods for partial differential equations
in Chapters 16 and 17.
3/7/03

171

c 2003

Peter J. Olver

The Legendre Polynomials


We shall construct an orthonormal basis for the space P (n) of polynomials of degree
n based on the L2 inner product
hp;qi =

p(t) q(t) dt.

(5.38)

(The method will work for any bounded interval, but choosing [ 1, 1 ] will lead us to a
particularly important case.) We shall apply the GramSchmidt orthogonalization process
to the elementary, but non-orthogonal monomial basis
1,
Because
k

ht ;t i =

t2 ,

t,

k+l

tn .

...

2
,
dt =
k+l+1

0,

k + l even,

(5.39)

k + l odd,

each odd degree monomial is orthogonal to all even degree monomials, but no other orthogonality applies. Let q0 (t), q1 (t), . . . , qn (t) denote the orthogonal polynomials that result from applying the GramSchmidt process to the non-orthogonal monomial basis. We
begin by setting
Z 1
2
q0 (t) = 1,
k q0 k =
q0 (t)2 dt = 2.
1

According to (5.20), the next orthogonal basis polynomial is


q1 (t) = t

h t ; q0 i
q (t) = t,
k q 0 k2 0

k q 1 k2 =

2
3

In general, the GramSchmidt formula (5.20) says we should define


qk (t) = t

k1
X

j =0

h tk ; q j i
q (t)
k q j k2 j

for

k = 1, 2, . . . .

We can then recursively compute the next few polynomials


k q 2 k2 =

q2 (t) = t2 31 ,

q3 (t) = t3 35 t,
q4 (t) = t4

6 2
7 t

3
35

k q 3 k2 =

k q 4 k2 =

8
45 ,
8
175 ,
128
11025

(5.40)
,

and so on. The reader can verify that they satisfy the correct orthogonality conditions
h qi ; q j i =
3/7/03

1
1

qi (t) qj (t) dt = 0,
172

i 6= j.
c 2003

Peter J. Olver

The polynomials q0 , q1 , q2 , . . . are known as the monic Legendre polynomials, in honor


of the 18th century French mathematician AdrienMarie Legendre who used them to study
Newtonian gravitation. Since the first n of them, namely q0 , . . . , qn1 span the subspace
P (n1) of polynomials of degree n 1, the next one, qn , is the unique monic polynomial
that is orthogonal to every polynomial of degree n 1:
h tk ; qn i = 0,

k = 0, . . . , n 1.

(5.41)

Since the monic Legendre polynomials form a basis for the space of polynomials, one
can uniquely rewrite any polynomial of degree n as a linear combination
p(t) = c0 q0 (t) + c1 q1 (t) + + cn qn (t).

(5.42)

The coefficients are simply given by inner products


1
h p ; qk i
=
ck =
k q k k2
k q k k2

1
1

p(t) qk (t) dt,

k = 0, . . . , n.

(5.43)

For example,
t4 = q4 (t) + 67 q2 (t) + 15 q0 (t) = (t4 67 t2 +

3
35 )

6
7

(t2 13 ) + 51 .

The coefficients c0 , . . . , c4 can either be obtained directly, or via the orthogonality formulae
(5.43); for example,
11025
c4 =
128

1
4

t qk (t) dt = 1,

175
c3 =
8

1
1

t4 q3 (t) dt = 0,

and so on.
The classical Legendre polynomials are certain scalar multiples of the orthogonal basis
polynomials, namely
Pk (t) =

(2 k)!
q (t)
(k!)2 k

2k

k = 0, 1, 2, . . . .

(5.44)

The normalization constant is fixed by the requirement that


Pk (1) = 1,

(5.45)

which is not so important here, but does play a role in other applications. The first few

A polynomial is called monic if it has leading coefficient equal to 1.

3/7/03

173

c 2003

Peter J. Olver

-1

-1

1.5

1.5

1.5

0.5

0.5

0.5

-0.5

0.5

-1

-0.5

0.5

-1

-0.5

-0.5

-0.5

-0.5

-1

-1

-1

-1.5

-1.5

-1.5

1.5

1.5

1.5

0.5

0.5

0.5

-0.5

0.5

-1

-0.5

0.5

-1

-0.5

-0.5

-0.5

-0.5

-1

-1

-1

-1.5

-1.5

-1.5

Figure 5.2.

0.5

0.5

The Legendre Polynomials P0 (t), . . . , P5 (t).

classical Legendre polynomials are


k P0 k2 = 2,

P0 (t) = 1,

k P 1 k2 =

P1 (t) = t,
P2 (t) =
P3 (t) =
P4 (t) =
P5 (t) =
P6 (t) =

1
3 2
2t 2,
5 3
3
2 t 2 t,
35 4
15 2
3
8 t 4 t + 8,
63 5
35 3
15
8 t 4 t + 8 t.
231 6
315 4
105 2
16 t 16 t + 16 t

k P 2 k2 =

k P 3 k2 =

k P 4 k2 =

5
16

k P 4 k2 =

k P 6 k2 =

2
3,
2
5,
2
7,
2
9,
2
11 ,
2
13 .

They are graphed in Figure 5.2.


There is, in fact, an explicit formula for the Legendre polynomials, due to the early
nineteenth century Portuguese mathematician Olinde Rodrigues.
Theorem 5.28. The Rodrigues formula for the classical Legendre polynomials along
with their norms is
r
1
dk
2
(5.46)
Pk (t) = k
(t2 1)k ,
k Pk k =
,
k = 0, 1, 2, . . . .
k
2 k! dt
2k + 1
Thus, for example,
P4 (t) =
3/7/03

d4 2
1 d4 2
1
4
(t

1)
=
(t 1)4 =
16 4! dt4
384 dt4
174

35 4
8 t

15 2
4 t

c 2003

+ 83 .
Peter J. Olver

Proof : Let
Rj,k (t) =

dj 2
(t 1)k ,
dtj

which is evidently a polynomial of degree 2 kj. In particular, the Rodrigues formula (5.46)
claims that Pk (t) is a multiple of Rk,k (t). Note that
d
R (t) = Rj+1,k (t).
dt j,k

(5.47)

Moreover,
Rj,k (1) = 0 = Rj,k ( 1)

whenever

j < k,

(5.48)

since, by the product rule, differentiating (t2 1)k a total of j < k times still leaves at
least one factor of t2 1 in each summand, which therefore vanishes at t = 1.
Lemma 5.29. If j k, then the polynomial Rj,k (t) is orthogonal to all polynomials
of degree j 1.
Proof : In other words,
Z
i
h t ; Rj,k i =

1
1

ti Rj,k (t) dt = 0,

for all

(5.49)

0 i < j k.

0
Since j > 0, we use (5.48) to write Rj,k (t) = Rj1,k
(t). Integrating by parts,
i

h t ; Rj,k i =

1
1

= it

0
ti Rj1,k
(t) dt

i1

0
(t)
Rj1,k

t = 1

ti1 Rj1,k (t) dt = i h ti1 ; Rj1,k i,

where the boundary terms vanish owing to (5.48). We then repeat the process, and eventually
h ti ; Rj,k i = i h ti1 ; Rj1,k i

= i(i 1) h ti2 ; Rj2,k i = = i(i 1) 2 h 1 ; Rj2,k i = 0,

which proves the lemma.

Q.E.D.

In particular, Rk,k (t) is a polynomial of degree k which is orthogonal to every polynomial of degree k 1. By our earlier remarks, this implies that it is a consatant
multiple,
Rk,k (t) = ck Pk (t)
of the k th Legendre polynomial. To determine ck , we need only check the highest order
terms:
dk
(2 k)! k
dk
t + ,
Rk,k (t) = k (t2 1)k = k (t2 k + ) =
dt
dt
k!
3/7/03

175

c 2003

Peter J. Olver

while
Pk (t) =

(2 k)! 2 k
t + .
2k k!

We conclude that ck = 1/(2k k!), which proves (5.46). The proof of the formula for k Pk k
can be found in Exercise .
Q.E.D.
The Legendre polynomials play an important role in many aspects of applied mathematics, including numerical analysis, least squares approximation of functions, and solution
of partial differential equations, and we shall encounter them again later in the book.
Other Systems of Orthogonal Polynomials
The standard Legendre polynomials form an orthogonal system with respect to the
L inner product on the interval [ 1, 1 ]. When dealing with any other interval, or, more
generally, a weighted inner product between functions on an interval, leads to a different
set of orthogonal polynomials. In all cases, applying the GramSchmidt process to the
standard monomials 1, t, t2 , t3 , . . . will produce the desired orthogonal system. However,
in some cases, a change of variables may be used to relate the orthogonal polynomials to
the Legendre polynomials circumvent the required GramSchmidt computation.
Suppose our goal is to construct an orthogonal system of polynomials for the L 2 inner
product
Z b
f (t) g(t) dt
hf ;gi =
2

on the interval [ a, b ]. The key remark is that we can map the interval [ 1, 1 ] to [ a, b ] by
a simple linear change of variables of the form s = + t. Specifically, if
s=

2t b a
ba

will change

0t1

to

1 s 1.

(5.50)

The map changes functions F (s), G(s) defined for 1 s 1 into the functions

2t b a
2t b a
f (t) = F
,
g(t) = G
,
ba
ba

(5.51)

defined for a t b. Moreover, interpreting (5.50) as a change of variables for the


integrals, the inner products are related by

Z b
Z b
2t b a
2t b a
F
f (t) g(t) dt =
hf ;gi =
G
dt
ba
ba
a
a
(5.52)
Z 1
2
2
=
F (s) G(s)
ds =
h F ; G i,
ba
ba
1
where the final L2 inner product is over the interval [ 1, 1 ]. In particular, the change of
variables maintains orthogonality, while rescaling the norms:
r
ba
(5.53)
hf ;gi = 0
if and only if
h F ; G i = 0,
kf k =
k F k.
2
3/7/03

176

c 2003

Peter J. Olver

Note that, under the change of variables (5.51), if f (t) is a polynomial of degree n in
t, then F (s) is a polynomial of degree n in s and vice versa. Applying these observations
to the Legendre polynomials, we immediately deduce the following.
Proposition 5.30. The modified Legendre polynomials

2
t

a
,
k = 0, 1, 2, . . . ,
Pek (t) = Pk
ba

(5.54)

form an orthogonal system of polynomials on the interval [ a, b ] with respect to the L 2


inner product. Moreover,
r
ba
k Pek k =
.
(5.55)
2k + 1
The formula for the norm follows from combining (5.46), (5.53).
Example 5.31. As an example, consider the L2 inner product
hf ;gi =

f (t) g(t) dt

(5.56)

on the interval [ 0, 1 ]. The map s = 2 t 1 will change 0 t 1 to 1 s 1. According


to Proposition 5.30, this change of variables will convert the Legendre polynomials P k (s)
into an orthogonal system of polynomials
r
1
2
.
Pek (t) = Pk (2 t 1),
with corresponding L norms
k Pek k =
2k + 1

on the interval [ 0, 1 ]. The first few are


Pe0 (t) = 1,

Pe1 (t) = 2 t 1,

Pe2 (t) = 6 t2 6 t + 1,

Pe3 (t) = 20 t3 30 t2 + 12 t 1,

Pe4 (t) = 70 t4 140 t3 + 90 t2 20 t + 1,

Pe5 (t) =

63 5
8 t

35 3
4 t

15
8

(5.57)

t.

5.5. Orthogonal Projections and Least Squares.


We have already encountered the problem of finding the point on a prescribed subspace
that lies closest to a given point. In this section, we shall discover an important geometrical
interpretation of our solution. Namely, that the closest point is the orthogonal projection
of the point onto the subspace. Furthermore, if we adopt an orthogonal, or, even better,
orthonormal basis for the subspace, then the orthogonal projection or closest point has
a very elegant, explicit formula. In this manner, orthogonality allows us to effectively
bypass the normal equations and solution formulae that were so laboriously computed in
Chapter 4. The orthogonal projection formulae have important practical consequences for
the solution of a wide range of least squares minimization problems.
3/7/03

177

c 2003

Peter J. Olver

W
Figure 5.3.

The Orthogonal Projection of a Vector onto a Subspace.

Orthogonal Projection
We begin by characterizing the orthogonal projection in geometrical terms. Throughout this section, we are given a finite-dimensional subspace W V of an inner product
space V .
A vector z V is said to be orthogonal to the subspace W if it is orthogonal to every
vector in W , so h z ; w i = 0 for all w W . Given a basis w1 , . . . , wn for W , we note that
z is orthogonal to W if and only if it is orthogonal to every basis vector: h z ; w i i = 0, for
i = 1, . . . , n. Indeed, any other vector in W has the form w = c1 w1 + + cn wn and
hence, by linearity, h v ; w i = c1 h v ; w1 i + + cn h v ; wn i = 0, as required.
Definition 5.32. The orthogonal projection of v onto the subspace W is the element
w W that makes the difference z = v w orthogonal to W .
The geometrical configuration underlying orthogonal projection is illustrated in Figure 5.3. Let us show how to explicitly construct the orthogonal projection of a vector on a
subspace. The result is greatly simplified by taking a orthonormal basis of the subspace,
which, if necessary, can be arranged by applying the GramSchmidt process to a known
basis of the subspace. Alternatively, a direct construction of the orthogonal projection in
terms of a general basis can be found in Exercise .
Theorem 5.33. Let u1 , . . . , un be an orthonormal basis for the subspace W V .
Then the orthogonal projection of a vector v V onto W is
w = c 1 u1 + + c n un

where

ci = h v ; ui i,

i = 1, . . . , n.

(5.58)

While the subspace is necessarily finite-dimensional, the inner product space itself may be
infinite-dimensional. Initially, though, the student may wish to use V = R n with the ordinary
Euclidean dot product for illustrative purposes.

3/7/03

178

c 2003

Peter J. Olver

Proof : First, since u1 , . . . , un form a basis of the subspace, the orthogonal projection
element w must be a linear combination of them, as in the formula. According to the
definition, the difference z = v w must be orthogonal to W , which requires that it be
orthogonal to every basis vector of W . Therefore, by orthonormality of the basis elements,
0 = h z ; u i i = h v ; u i i h w ; u i i = h v ; u i i h c 1 u1 + + c n un ; u i i
= h v ; u i i c 1 h u1 ; u i i c n h un ; u i i = h v ; u i i c i ,
for each i = 1, . . . , n. We conclude that the coefficients ci = h v ; ui i of w are uniquely
prescribed by the orthogonality requirement, and so the orthogonal projection w is, in
fact, unique.
Q.E.D.
More generally, if we only have an orthogonal basis v1 , . . . , vn for W , then the same
argument show that the orthogonal projection of v onto W is given by
w = a 1 v1 + + a n vn ,

where

ai =

h v ; vi i
,
k v i k2

i = 1, . . . , n.

(5.59)

Of course, we could also replace the orthogonal basis by an orthonormal basis by dividing
each vector by its length: ui = vi /k vi k. The reader should be able to prove that the two
formulae (5.58), (5.59) for the orthogonal projection give the same vector w.
Example 5.34. Consider the plane W R 3 spanned by the orthogonal vectors

1
1
v 2 = 1 .
v1 = 2 ,
1
1
T

According to (5.59), the orthogonal projection of v = ( 1, 0, 0 ) onto W is


1
1
1
2
h v ; v1 i
h v ; v2 i
1
1
v +
v = 2 + 1 = 0 .
w=
k v 1 k2 1
k v 2 k2 2
6
3
1
1
1
2
Alternatively, we can replace v1 , v2 by an orthonormal basis
1

62
v1
u1 =
= 6
,
k v1 k

u2 =

1
6

Then, using (5.58),

1
w = h v ; u 1 i u1 + h v ; u 2 i u2 =
6
3/7/03

179

1
62

6
1
6

v2
=
k v2 k

+ 1

1
3
1
3
1
3

1
3
1
3
1
3

2
= 0 .

1
2

c 2003

Peter J. Olver

The answer is, of course, the same. As the reader may notice, while the theoretical formula
is simpler when written in an orthonormal basis, for hand computations the orthogonal
basis version avoids lots of square roots. (Of course, when performing the computation on
a computer, this is not usually a significant issue.)
An intriguing observation is that the coefficients in (5.58) and (5.59) have exactly
the same formulae, (5.5), (5.7), as if the vector v were an element of the subspace W .
Indeed, if v W , then it would coincde with its orthogonal projection, w = v, and hence
the orthogonal projection formulae would reduce to the earlier basis formulae as a specail
case.
The same formulae also occur in our implementation of the GramSchmidt algorithm;
see (5.19) and (5.29). Thus, we find a useful geometric interpretation for the GramSchmidt
construction. Comparing with (5.59), we find that the GramSchmidt formula (5.20) has
the form
vk = w k zk ,
where zk is the orthogonal projection of wk onto the subspace Vk1 = span {v1 , . . . , vk1 } =
span {w1 , . . . , wk1 } spanned by the preceding k 1 basis vectors v1 , . . . , vk1 . The resulting vector is, by construction, orthogonal to the subspace, and hence orthogonal to all
of the previous basis elements.
Orthogonal Least Squares
The orthogonal projection of a vector onto a subspace is also the least squares vector
the closest point in the subspace!
Theorem 5.35. Let W V be a finite-dimensional subspace of an inner product
space. Given a vector v V , the closest point w W is the same as the orthogonal
projection of v onto W .
Proof : Let w W be the orthogonal projection of v onto the subspace, which requires
e W is any other vector in
that the difference z = v w be orthogonal to W . Suppose w
the subspace. Then,
e k2 = k w + z w
e k2 = k w w
e k2 + 2 h w w
e ; z i + k z k2 = k w w
e k2 + k z k 2 .
kv w

e ; z i = 0 vanishes because z is orthogonal to every vector


The inner product term h w w
e Since z = v w is uniquely prescribed by the vector v, the second
in W , including w w.
e W . Therefore, k v w
e k2
term k z k2 does not change with the choice of the point w
e k2 is minimized. Since w
e W is allowed to be
will be minimized if and only if k w w
e k2 = 0 occurs when w
e = w.
any element of the subspace W , the minimal value k w w
Thus, the closest point or least squares minimizer is the orthogonal projection.
Q.E.D.

In particular, if we have found an orthogonal or orthonormal basis of our subspace,


then we can compute the closest least squares point w W to v our preceding formulae
(5.58) or (5.59). in this manner, orthogonal bases have a dramatic simplifying effect on
the least squares approximation formulae.
3/7/03

180

c 2003

Peter J. Olver

Example 5.36. Consider the least squares problem of finding the closest point w to
T
the vector v = ( 1, 2, 2, 1 ) in the three-dimensional subspace spanned by the orthogonal
vectors
T

v1 = ( 1, 1, 2, 0 ) ,

v2 = ( 0, 2, 1, 2 ) ,

v3 = ( 1, 1, 0, 1 ) .

Since the spanning vectors are orthogonal (but not orthonormal), we can use the orthogonal
projection formula (5.59) to find the linear combination w = a1 v1 + a2 v2 + a3 v3 . Thus,
a1 =

3
1
h v ; v1 i
= = ,
2
k v1 k
6
2

a2 =

h v ; v2 i
4
= ,
2
k v2 k
9

and hence
w=

1
2

v1 + 49 v2 + 43 v3 =

is the closest point in the least squares sense.

11
6

31 13 4
, 18
, 9,9

a3 =

h v ; v3 i
4
= ,
2
k v3 k
3

Even when we are given a basis for the subspace that is not orthogonal, it may still
be a good strategy to first use GramSchmidt to find an orthogonal or even orthonormal
basis, and then apply the orthogonal projection formulae. Not only does this simplify
the final computation, it will often avoid the ill-conditioning and numerical innaccuracy
that often follow from directly solving the normal equations (4.29). The following example
illustrates this alternative procedure.
Example 5.37. Let us return to the problem, solved in Example 4.8, of finding
T
T
the closest point on plane V spanned by w1 = ( 1, 2, 1 ) , w2 = ( 2, 3, 1 ) to b =
T
( 1, 0, 0 ) . We proceed now by first using To find the closest point on the given plane to
b, we first combine the basis vectors GramSchmidt to compute the orthogonal basis
T

v1 = w1 = ( 1, 2, 1 ) ,

v 2 = w2

Therefore, applying (5.59), the closest point is


v? =

w2 v 1
.
w1 = 52 , 2, 32
2
k v1 k

b v1
b v1
1
7 T
v
+
v
=
,

,
1
2
3
15
15
k v 1 k2
k v 1 k2

reconfirming the result of Example 4.8. In this approach, we manage to avoid solving any
linear equations.
Let us revisit now the problem of approximating data by a least squares minimization
procedure. The required calculations are significantly simplified by the introduction of an
orthonormal basis of the least squares subspace. Given sample points t 1 , . . . , tm , let
T

tk = tk1 , tk2 , . . . , tkm ,

k = 0, 1, 2, . . .

be the vectors obtained by sampling the monomial tk . More generally, sampling a polynomial
y = p(t) = 0 + 1 t + + n tn
(5.60)
3/7/03

181

c 2003

Peter J. Olver

results in the same linear combination


T

p = ( p(t1 ), . . . , p(tn ) ) = 0 t0 + 1 t1 + + n tn

(5.61)

of sample vectors. We
that the set of sampled polynomials forms a subspace
conclude
m
T = span t0 , . . . , tn R spanned by the monomial sample vectors.
T

Let y = ( y1 , y2 , . . . , ym ) denote a set of data measured at the sample points. The


polynomial least squares approximation to the given data is, by definition, the polynomial
y = p(t) whose corresponding sample vector p T is the closest point belonging to the
sample subspace T . The sampled monomial vectors t0 , . . . , tn are not orthogonal, and so
one must solve the usual normal equations (4.36) in order to find the desired least squares
coefficients 0 , . . . , n .
An alternative approach is to first use the GramSchmidt procedure to construct an
orthogonal basis for the subspace T , from which the least squares coefficients are found by
simply taking an appropriate inner product. Let us adopt the rescaled version
m
1 X
v w = vw
hv;wi =
m i=1 i i

(5.62)

of the standard dot product on R m . Note that h v ; w i is equal to the average value of the
product function v(t) w(t) on the sample points. In particular, the inner product between
our monomial basis vectors corresponding to sampling tk and tl is
h tk ; t l i =

m
m
1 X k+l
1 X k l
ti ti =
t
= tk+l ,
m i=1
m i=1 i

(5.63)

which is the averaged sample value of the monomial tk+l .


To keep the formulae reasonably simple, let us further assume that the sample points
are evenly spaced and symmetric about 0. The second requirement means that if t i is
a sample point, so is ti = tmi+1 . An example would be the seven sample points
3, 2, 1, 0, 1, 2, 3. As a consequence of these two assumptions, the averaged sample
values of the odd powers of t vanish:
t2i+1 = 0.
Hence, by (5.63), the sample vectors tk and tl are orthogonal whenever k + l is odd.
Applying the GramSchmidt algorithm to t0 , t1 , t2 , . . . produces the orthogonal basis
vectors q0 , q1 , q2 , . . . . Each qk = (qk (t1 ), . . . , qk (tm )) can be interpreted as the sample
vector for a certain interpolating polynomial qk (t) of degree k. The following table shows

For a weighted least squares problem, we accordingly weight the inner product.

The method works without these particular assumptions, but the formulas become more
unwieldy; see Exercise .

3/7/03

182

c 2003

Peter J. Olver

the first few polynomials qk (t), their corresponding orthogonal sample vectors, along with
their squared norms, k qk k2 = qk (t)2 :
k q0 k2 = 1,

q0 (t) = 1,

q 0 = t0 ,

q1 (t) = t,

q 1 = t1 ,

q2 (t) = t2 t2 ,

q 2 = t 2 t 2 t0 ,

q3 (t) = t3

t4
t2

t,

q 3 = t3

t4
t2

k q 1 k2 = t2 ,

2
k q 2 k 2 = t 4 t2 ,
2
t4
k q 3 k2 = t6
.
t2

t1 ,

(5.64)

With these in hand, the least squares approximating polynomial of degree n to the
given data vector y is, then, given by a linear combination
p(t) = c0 q0 (t) + c1 q1 (t) + c2 q2 (t) + + cn qn (t).

(5.65)

According to (5.59), the required coefficients are given by taking inner products
ck =

h qk ; y i
q y
= k .
2
k qk k
qk2

(5.66)

Thus, once we have arranged the orthogonal system, there are no linear systems to be
solved to find the least squares approximation.
An additional advantage of this approach is that one can readily increase the degree,
and, presumably, the accuracy, of the approximating polynomial without having to recompute the lower degree terms. For instance, if a quadratic approximant c 0 +c1 q1 (t)+c2 q2 (t)
looks insufficiently close, one can add in the cubic term c3 q3 (t) with c3 given by (5.66)
for k = 3, without having to recompute the quadratic coefficients c0 , c1 , c2 . This simplification is not valid when using the non-orthogonal basis elements, where the lower order
coefficients will change if the degree of the approximating polynomial is increased.
Example 5.38. Consider the following tabulated sample values:
ti

yi

1.4

1.3

.6

.1

.9

1.8

2.9

To compute polynomial least squares fits of degrees 3, we begin by computing the


polynomials (5.64), which for the given sample points ti are
q3 (t) = t3 7 t ,
216
k q0 k2 = 1,
k q1 k2 = 4,
k q2 k2 = 12,
k q 3 k2 =
.
7
Thus, the coefficients (5.66) for the least squares approximation (5.65) are
q0 (t) = 1,

q1 (t) = t,

q2 (t) = t2 4,

1
h q ; y i = 0.7357,
4 1
7
c3 =
h q ; y i = 0.0083.
216 3

c0 = h q0 ; y i = 0.1714,
c2 =
3/7/03

c1 =

1
h q ; y i = 0.0024,
12 2
183

c 2003

Peter J. Olver

To obtain the best linear approximation, we use


p1 (t) = c0 q0 (t) + c1 q1 (t) = 0.1714 + 0.7357 t,
with a least squares error of 0.7081. Similarly, the quadratic and cubic least squares
approximations are
p2 (t) = 0.1714 + 0.7357 t + 0.0024(t2 4),

p3 (t) = 0.1714 + 0.7357 t + 0.0024(t2 4) 0.0083(t3 7 t),

with respective least squares errors 0.2093 and 0.1697 at the sample points. A plot of the
three approxiations appears in Figure osa .
A key observation is that the same coefficients cj appear in all the least squares
formulae. thus, unlike the direct approach, we are not required to recompute any of the
lower order coefficients when increasing the order of the approximating polynomial. thus,
if we decided that a cubic polynomial was insufficently accurate, we would only need to
compute the coefficient c4 corresponding to the fourth order orthogonal polynomial to
obtain the quartic least squares approximation.
Orthogonal Polynomials and Least Squares
In a similar fashion, one uses Legendre polynomials and more general orthogonal
systems of polynomials and functions to simplify the determination of least squares approximants in function spaces. Let us now reconsider the problem, from Chapter 4, of
approximating et by a polynomial of degree n. For the interval 1 t 1, we write the
best least squares approximant as a linear combination of Legendre polynomials,

(5.67)
p(t) = c0 P0 (t) + c1 P1 (t) + + cn Pn (t) = c0 + c1 t + c2 32 t2 12 + .
Since the Legendre polynomials form an orthogonal basis, the least squares coefficients can
be immediately computed by the inner product formula (5.59), so
2k + 1
h et ; Pk i
=
ck =
2
k Pk k
2

1
1

et Pk (t) dt.

For example, the quadratic approximation is obtained from the first three terms in (5.67),
where

Z
1 1 t
1
1
c0 =
e dt =
e
' 1.175201,
2 1
2
e
Z
3
3 1 t
t e dt =
' 1.103638,
c1 =
2 1
e

Z
5
7
5 1 3 2 1 t
c2 =
t 2 e dt =
e
' .357814.
2 1 2
2
e
Therefore

et 1.175201 + 1.103638 t + .357814


3/7/03

184

2
2 t

1
2

c 2003

(5.68)
Peter J. Olver

2.5

2.5

2.5

1.5

1.5

1.5

0.5

0.5

0.5

0.2

0.4

0.6

0.8

0.2

Figure 5.4.

0.4

0.6

0.8

0.2

0.4

0.6

0.8

Least Squares Approximation to et .

gives the quadratic least squares approximation to et on [ 1, 1 ]. Graphs appear in


Figure 5.4; the first graph shows et , the second (5.68), and the third lays the two graphs
on top of each other.
There are two major advantages of the orthogonal Legendre approach over the direct
approach presented in Example 4.23. First, there is no linear system of equations to solve.
Indeed, the coefficient matrix for polynomial least squares approximation based on the
monomial basis is some variant of the notoriously ill-conditioned Hilbert matrix, (1.72),
and the computation of an accurate solution is particularly tricky. Our precomputation
of an orthogonal system of polynomials has successfully circumvented the Hilbert matrix,
and entirely avoided solving any linear system!
The second advantage was already mentioned in the preceding subsection. Unlike the
direct approach, the coefficients ck do not change if we desire to go to higher accuracy by
increasing the degree of the approximating polynomial. For instance, in the first case, if the
quadratic approximation (5.68) is not accurate enough, we can add in a cubic correction

c3 P3 (t) = c3 25 t3 32 t ,
where we compute the required coefficient by

Z
7 1 5 3 3 t
7
5
c3 =
t 2 t e dt =
37e
' .070456.
2 1 2
2
e

Unlike the monomial basis, we do not need to recompute the coefficients c 0 , c1 , c2 . The
successive Legendre coefficients decrease fairly rapidly:
c0 ' 1.175201,
c4 ' .009965,

c1 ' 1.103638,
c5 ' .001100,

c2 ' .357814,

c6 ' .000099,

c3 ' .070456,

leading to greater and greater accuracy in the approximation. The detailed reason for this
will be explained in Chapter 11.
Finally, to apply this method to the problem of approximating the function e t the
interval [ 0, 1 ] considered in Example 4.23. To construct the least squares approximants
for the L2 norm, we use the rescaled Legendre polynomials constructed in (5.57). The
polynomial least squares approximation of degree n to a function f (t) on [ 0, 1 ] is thus
given by
c0 + c1 Pe1 (t) + c2 Pe2 (t) + + cn Pen (t),
3/7/03

185

c 2003

Peter J. Olver

where

Z 1
h f ; Pek i
= (2 k + 1)
ck =
f (t) Pek (t) dt.
k Pe k2
0
k

For the particular function et , we find


Z 1
et dt = e 1 ' 1.71828,
c0 =
0

c1 = 3
c2 = 5

(2 t dt 1) et = 3(3 e) ' .845155,

0
1
0

(6 t2 6 t + 1)et dt = 5(7 e 19) ' .139864.

Thus, the best quadratic least squares approximation is


p?2 (t) = 1.71828 + .845155 (2 t 1) + .139864 (6 t2 6 t + 1)
= 1.012991 + .851125 t + .839184 t2 .
It is worth emphasizing that this is the same approximating polynomial as we computed in
(4.59). The use of an orthogonal system of polynomial merely simplifies the computation.
Moreover, if we decide to replace the quadratic approximant by a cubic, we do not have
to recompute c0 , c1 , c2 , but only

Z 1
5
7
3
2
t
37e
' .013931.
(20 t 30 t + 12 t 1) e dt =
c3 = 7
2
e
0
Thus, the cubic least squares approximant is
p?3 (t) = p?2 (t) + .013931 (20 t3 30 t2 + 12 t 1)

= 0.999060 + 1.018300 t + 0.421246 t2 + 0.278625 t3 .

5.6. Orthogonal Subspaces.


We now extend the notion of orthogonality from individual elements to subspaces of
an inner product space V .
Definition 5.39. Two subspaces W, Z V are called orthogonal if every vector in
W is orthogonal to every vector in Z.
In other words, W and Z are orthogonal subspaces if and only if h w ; z i = 0 for every
w W, z Z. In practice, one only needs to check orthogonality of basis elements. Thus,
if w1 , . . . , wk is a basis for W and z1 , . . . , zl a basis for Z, then W and Z are orthogonal
if and only if h wi ; zj i = 0 for all i = 1, . . . , k and j = 1, . . . , l.
1. The plane W R 3 defined by the equation 2 x y + 3 z = 0 is orthogonal to the line
T
T
Z = span n spanned by its normal vector n = ( 2, 1, 3 ) . Indeed, every w = ( x, y, z )
W satisfies the orthogonality condition w n = 2 x y + 3 z = 0, which is just the equation
for the plane.
3/7/03

186

c 2003

Peter J. Olver

Figure 5.5.

Orthogonal Complement to a Line.

Definition 5.40. The orthogonal complement to a subspace W V , denoted W ,


is defined as the set of all vectors which are orthogonal to W , so
W = { v V | h v ; w i = 0 for all w W } .
One easily checks that the orthogonal complement W to a subspace W V is also
a subspace.
T

Example 5.41. Let W = { ( t, 2 t, 3 t ) | t R } be the line (one-dimensional subT


space) in the direction of the vector w = ( 1, 2, 3 ) R 3 . The orthogonal complement
W will be the plane passing through the origin having normal vector w; see Figure 5.5.
T
In other words, v = ( x, y, z ) W if and only if
v w = x + 2 y + 3 z = 0.

(5.69)

Thus W is characterized as the solution space to the homogeneous linear system (5.69),
or, equivalently, the kernel of the 1 3 matrix A = w T = ( 1, 2, 3 ).
We can write the general solution to the system in the form

2y 3z
2
3
= y 1 + z 0 = y v1 + z v2 ,
v=
y
z
0
1
where y, z are the free variables. The indicated solution vectors
T

v1 = ( 2, 1, 0 ) ,

v2 = ( 3, 0, 1 ) ,

form a (non-orthogonal) basis for the orthogonal complement W .


Proposition 5.42. If dim W = m and dim V = n, then dim W = nm. Moreover,
every vector v V can be uniquely decomposed into v = w +z where w W and z W
are the orthogonal projections of v onto the respective subspaces.
3/7/03

187

c 2003

Peter J. Olver

Proof : Let u1 , . . . , un be an orthonormal basis for V . Let w1 , . . . , wm be a basis for


W , which we can write
wi =

n
X

aij uj ,

i = 1, . . . , m,

j =1

in terms of the prescribed orthonormal basis of V . A vector v = x1 u1 + + xn un W


belongs to its orthogonal complement if and only if
h wi ; v i =

n
X

j,k = 1

aij xk h uj ; uk i =

n
X

aij xj = 0,

i = 1, . . . , m.

(5.70)

j =1
T

Therefore the coordinates x = ( x1 , x2 , . . . , xn ) of v satisfy the homogeneous linear system


A x = 0. We can thereby identif elements of W with vectors x belonging to the kernel
of the matrix A. Furthermore, since w1 , . . . , wm are linearly independent, the matrix
A = (aij ) has rank m. The Fundamental Theorem 2.47 then implies that ker A = W
has dimension n m, while W = corng A has dimension m and forms a complementary
subspace.
To verify the last statement, we let w be the orthogonal projection of v onto W ,
which means that z = v w is orthogonal to W and hence an element of W . Uniqueness
of the decomposition follows from the fact that the orthogonal projection is uniquely
defined.
Q.E.D.
Corollary 5.43. If dim V < then the orthogonal complement of W is equal to
W = (W ) .
Warning: Corollary 5.43 is not necessarily true for infinite-dimensional vector spaces.
In general, if dim W = , one can only assert that W (W ) . For example, it
can be shown that, [oc], on any bounded interval [ a, b ] the orthogonal complement to the
subspace of all polynomials P () in C0 [ a, b ] with respect to the L2 inner product is trivial:
(P () ) = {0}. This means that the only continuous function which satisfies
n

h x ; f (x) i =

xn f (x) dx = 0,

for all

n = 0, 1, 2, . . .

is the zero function f (x) 0. But the orthogonal complement of {0} is the entire space,
and so ((P () ) ) = C0 [ a, b ] 6= P () .
Orthogonality of the Fundamental Matrix Subspaces and the Fredholm Alternative
We have already encountered the four fundamental subspaces associated with an mn
matrix A. The first two, the kernel and the corange, are subspaces of R n having complementary dimensions. The second two, the cokernel and the range, are subspaces of R m ,
also of complementary dimensions. In fact, more than this is true the pairs consist of
orthogonal complementary subspaces with respect to the standard Euclidean dot product.
3/7/03

188

c 2003

Peter J. Olver

Theorem 5.44. Let A be an m n matrix of rank r. Then its kernel and corange
are orthogonal complements as subspaces of R n , of respective dimensions n r and r,
while its cokernel and range are orthogonal complements in R m , of respective dimensions
m r and r:
corng A = (ker A) R n ,

rng A = (coker A) R m .

(5.71)

Proof : A vector x R n lies in ker A if and only if A x = 0. Let rT1 , . . . , rTm R n


denote the rows of A. According to the rules of matrix multiplication, the i th entry of A x
equals the product rTi x = ri x = 0, which vanishes if and only if x is orthogonal to r i .
Therefore, x ker A if and only if x is orthogonal to all the rows of A. Since the rows
span corng A = rng AT , this is equivalent to the statement that x lies in the orthogonal
complement (corng A) . The proof for the range and cokernel follows from the same
argument applied to AT .
Q.E.D.
Combining Theorems 2.47 and 5.44, we deduce the following important characterization of compatible linear systems, known as the Fredholm alternative, named after the
Swedish mathematician Ivar Fredholm. Fredholms work was devoted to integral equations, but his alternative is applicable to very general linear systems, including differential
equations, integral equations, variational problems, and many others.
Theorem 5.45. The linear system A x = b has a solution if and only if b is orthogonal to the cokernel of A.
Therefore, the compatibility conditions for the linear system A x = b can be written
in the form
yb=0
for every y satisfying
AT y = 0.
(5.72)
In practice, one only needs to check orthogonality of the vector b with respect to a basis
y1 , . . . , ymr of the cokernel, leading to a system of m r compatibility constraints, where
r = rank A denotes the rank of the coefficient matrix.
Example 5.46. The coefficient matrix

x1
1 0 1
x = x2 ,
A = 0 1 2 ,
x3
1 2 3

b1
b = b2 ,
b3

of the system A x = b that was considered in Example 2.41 has rank r = 2. The single
compatability condition for the system to have a solution was found by Gaussian elimination:
b1 + 2 b2 + b3 = 0.
(5.73)
We now understand the meaning behind this equation: it is telling us that the right hand
side b must be orthogonal to the cokernel of A. The cokernel is the subspace of vectors
y satisfying the homogeneous adjoint system AT y = 0. The solution the line spanned by
the vector y1 = (1, 2, 1)T , and (5.73) says that b must be orthogonal to y1 , in accordance
with the Fredholm Theorem 5.45.
3/7/03

189

c 2003

Peter J. Olver

Example 5.47. Consider the linear system A x = b with coefficient matrix

1 1
3
2
A=
.
1 1
1
2

Solving the homogeneous adjoint equation AT y = 0 by Gaussian elimination, we produce


a basis
T
T
y1 = ( 1 0 0 1 ) ,
y2 = ( 5 3 1 0 ) ,
for coker A. Therefore, the system A x = b is compatible if and only if
y1 b = b1 + b4 = 0,

y2 b = 5 b1 + 3 b2 + b3 = 0.

The reader can check that these are indeed the same compatibility
conditions
that result

from a direct Gaussian elimination on the augmented matrix A | b .

Suppose A is an m n matrix of rank r. Then A maps R n to the range subspace


rng A R m . The elements of ker A are all mapped to the zero vector, and so the elements
of the complementary subspace corng A R n must fill all of rng A. Both the range and
the corange have the same dimension, namely r, and as a consequence, restricting A to
the corange defines a one-to-one map.
Proposition 5.48. The restriction A: corng A rng A defines a one-to-one map
between subspaces. Moreover, if v1 , . . . , vr forms a basis of corng A then their images
A v1 , . . . , A vr form a basis for rng A.
Proof : If v, w corng A satisfy A v = A w, then A(v w) = 0 and hence v w
ker A. but the only vector in the intersection of the kernel and corange is the zero vector,
and hence v = w. Moreover, given any b rng A, we have b = A x for some x R n . We
decompose x = v + z where v corng A and z ker A, whence b = A x = A v, and hence
every vector in the range comes from a vector in the corange.
Q.E.D.

3/7/03

190

c 2003

Peter J. Olver

Chapter 6
Equilibrium
In this chapter, we present some significant applications of linear algebra to the analysis of simple mechanical structures and electrical circuits. We will discover that there are
remarkable analogies between electrical and mechanical systems. Both systems fit into a
very general mathematical framework which, when suitably formulated, will also apply in
the continuous realm, and ultimately govern the equilibria of systems arising throughout
physics and engineering. The key difference is that discrete structures and circuits are governed by linear algebraic equations, whereas continuous media are modeled by differential
equations and boundary value problems.
In the mechanical and electrical systems treated in the present chapter, the linear system governing the equilibrium configuration has the same structure: the coefficient matrix
is of general Gram form, and thereby positive (semi-)definite. The positive definite cases
correspond to stable structures and circuits, which can support any external forcing, and
possess a unique stable equilibrium solution that can be characterized by a minimization
principle. In physical language, the system seeks to minimize its potential energy. On the
other hand, the positive semi-definite cases correspond to unstable structures and circuits
that cannot remain in equilibrium except for very special configurations of external forces.
In the case of mechanical structures, the instabiliuties are of two types: rigid motions,
under which the structure maintains its overall shape, and mechanisms.
We begin by analyzing in detail a linear chain of masses interconnected by springs.
The basic mathematical framework is already manifest in this simple mechanical structure.
Next, we consider simple electrical circuits consisting of resistors and current sources interconnected by a network of wires. Finally, we treat two and three-dimensional structures
constructed out of elastic bars. In all cases, we only consider the equilibrium configurations;
dynamical processes for each of the physical systems will be taken up in Chapter 8.

6.1. Springs and Masses.


Consider a massspring chain consisting of n masses
m1 , m 2 , . . . m n
arranged in a straight line. For simplicity, we will only allow the masses to move along
this line one-dimensional motion. (Section 6.3 deals with two- and three-dimensional
motions.) Each mass is connected to its immediate neighbor(s) by a spring. Moreover, the
massspring chain may be connected at one or both ends by a spring to a solid support.
At first, for specificity, let us look at the case when both ends of the chain are attached,
3/7/03

191

c 2003

Peter J. Olver

m1

m2

m3

Figure 6.1.

A MassSpring Chain with Fixed Ends.

as illustrated in Figure 6.1. To be definite, we assume that the masses are arranged in a
vertical line, and order them from top to bottom. On occasion, we may refer to the top
support as mass m0 and the bottom support as mass mn+1 .
If we subject some or all of the masses to an external force, e.g., gravity, then the
system will move to a new equilibrium position. The motion of the ith mass is measured
by its displacement ui from its original position. Referring to Figure 6.1, we use the
convention that ui > 0 if the mass has moved downwards, and ui < 0 if it has moved
upwards. The problem is to determine the new equilibrium configuration of the chain
under the prescribed forcing, that is, to set up and solve a system of equations for the
displacements u1 , . . . , un .
Let ej denote the elongation of the j th spring, which connects mass mj1 to mass mj .
By elongation, we mean how far the spring has been stretched, so that e j > 0 if the spring
is longer than its reference length, while ej < 0 if the spring has been compressed. The
elongations can be determined directly from the displacements according to the geometric
formula
ej = uj uj1 ,
j = 2, . . . , n,
while
e1 = u1 ,

en+1 = un ,

since the top and bottom supports are fixed. We write the elongation equations in matrix
form
e = A u,
(6.1)

The differential equations governing its dynamical behavior during the motion will be the
subject of Chapter 8. Damping or frictional effects will cause the system to eventually settle down
into the new equilibrium configuration.

3/7/03

192

c 2003

Peter J. Olver

e
1
e2
where e =
..
.

is the elongation vector , u = .2 is the displacement vector , and


.

.
un
en+1
the coefficient matrix

1 1

1 1

1 1
(6.2)
A=

..
..

.
.

1 1
1

has size (n + 1) n, with only the non-zero entries being indicated. The matrix A is
known as the reduced incidence matrix for the massspring chain. It effectively encodes
the underlying geometry of the massspring chain, including the boundary conditions at
the top and the bottom. (The connection with the incidence matrix of a graph will become
evident in Section 6.2.)
The next step is to connect the elongation ej experienced by the j th spring to its internal force yj . This is the basic constitutive assumption, that relates geometry to kinematics.
In the present case, we shall assume that the springs are not stretched (or compressed)
particularly far, and so obey Hookes Law
yj = c j e j ,

(6.3)

named after the prolific seventeenth century English scientist and inventor Robert Hooke.
The constant cj > 0 measures the springs stiffness. Hookes Law says that force is
proportional to elongation the more you stretch a spring, the more internal force it
experiences. A hard spring will have a large stiffness and so takes a large force to stretch,
whereas a soft spring will have a small, but still positive, stiffness. We write (6.3) in matrix
form
y = C e,
(6.4)
where

y
1
y2
y=
..
.

yn+1

C=

c2
..

.
cn+1

Note particularly that C > 0 is a diagonal, positive definite matrix.


Finally, the forces must balance if the system is to remain in equilibrium. Let f i
denote the external force on the ith mass mi . We also measure force in the downwards
direction, so fi > 0 means the force is pulling the ith mass downwards. (In particular,
gravity would induce a positive force on each mass.) The ith mass is immediately below
the ith spring and above the (i + 1)st spring. If the ith spring is stretched, it will exert an
3/7/03

193

c 2003

Peter J. Olver

upwards force on mi , while if the (i + 1)st spring is stretched, it will pull mi downwards.
Therefore, the balance of forces on mi requires that
fi = yi yi+1 .

(6.5)

The matrix form of the force balance law is


f = AT y

(6.6)

where f = (f1 , . . . , fn )T . The remarkable fact is that the force

1 1

1 1

1 1
AT =

1 1

..
..

.
.
1

balance coefficient matrix

(6.7)

is the transpose of the reduced incidence matrix (6.2) for the chain. This connection between geometry and force balance turns out to be very general, and is the reason underlying
the positivity of the final coefficient matrix in resulting system of equilibrium equations.
Summarizing, we have
e = A u,

y = C e,

f = AT y.

(6.8)

K = AT C A

(6.9)

These equations combine into a single linear system


Ku = f,

where

is called the stiffness matrix associated with the entire massspring chain. The stiffness
matrix K has the form of a Gram matrix (3.48) for the weighted inner product h v ; w i =
vT C w induced by the diagonal matrix of spring stiffnesses. Theorem 3.33 tells us that
since A has linearly independent columns (which should be checked), and C > 0 is positive
definite, then the stiffness matrix K > 0 is automatically positive definite. In particular,
Theorem 3.38 guarantees that K is a regular, and hence invertible matrix, and hence the
linear system (6.9) has a unique solution u = K 1 f . We can therefore conclude that the
massspring chain has a unique equilibrium position.
In fact, in the case considered here

c1 + c 2
c2

c2
c2 + c 3
c3

c3
c3 + c 4
c4

c4
c4 + c 5 c 5

(6.10)
K=

..
..
..

.
.
.

c
c
+c
c
n1

n1

cn

cn + cn+1

has a very simple symmetric, tridiagonal form. As such, we can apply our tridiagonal
solution algorithm of Section 1.7 to rapidly solve the system.
3/7/03

194

c 2003

Peter J. Olver

Example 6.1. Let us consider the particular case of n = 3 masses connected by


identical springs with unit spring constant (in appropriate units) , so c 1 = c2 = c3 = c4 = 1
and C = diag(1, 1, 1, 1) = I is the 4 4 identity matrix. The 3 3 stiffness matrix is then

1 1
T

K=A A= 0 1
0 0

0
1
1

1
0
1
0
0
1
0

0
1
1
0


0
2
0
= 1
1
0
1

1
2
1

0
1 .
2

A straightforward Gaussian elimination produces the K = L D LT factorization


1
0 0
2 1 0
2 0 0
1 12
0
1 2 1 = 1
1 0 0 32 0 0 1 23 .
2
0 1 2
0 32 1
0 0 43
0 0
1

With this in hand, we can solve the basic equilibrium equations K u = f by our basic
forward and back substitution algorithm.
Suppose, for example, we pull the middle mass downwards with a unit force, so f 2 = 1
T
while f1 = f3 = 0. Then f = ( 0, 1, 0 ) , and the solution to the equilibrium equations (6.9)
T

is u = 12 , 1, 12 , whose entries prescribe the mass displacements. Observe that all three
masses have moved down, with the middle mass moving twice as far as the other two. The
corresponding spring elongations and internal forces are obtained by matrix multiplication
y = e = Au =

1
1
1
2, 2, 2, 2

Thus the top two springs are elongated, while the bottom two are compressed, all by an
equal amount.
Similarly, if all the masses are equal, m1 = m2 = m3 = m, then the solution under a
T
constant downwards gravitational force f = ( m g, m g, m g ) of magnitude g is

mg
2 mg
u = K 1 m g = 2 m g ,
3
mg
2 mg
and

y = e = Au =

1
1
3
2 m g, 2 m g, 2 m g, 2 m g

Now, the middle mass has moved 33% farther than the others, whereas the top and bottom
spring are experiencing three times as much elongation/compression.
An important observation is that we cannot determine the internal forces y or elongations e directly from the force balance law (6.6) because the transposed matrix A T is
not square, and so the system f = AT y does not have a unique solution. We must first
determine the displacements u using the full equilibrium equations (6.9), and then use the
resulting diplacements to reconstruct the elongations and internal forces. This situation is
referred to as statically indeterminate.
3/7/03

195

c 2003

Peter J. Olver

m1
m2
m3

Figure 6.2.

A MassSpring Chain with One Free End.

Remark : Even though we construct K = AT C A and then factor it as K = L D LT ,


there is no direct algorithm to get from A and C to L and D, which, typically, are matrices
of a different size.
The precise form of the system will depend upon the boundary conditions. Suppose,
by way of contrast, that we only fix the top of the chain to a support, and leave the bottom
mass hanging freely, as in Figure 6.2. The geometric relation between the displacements
and the elongations has the same form (6.1) as before, but the reduced incidence matrix
is slightly altered:

1
1 1

1 1

.
A=
(6.11)

1 1

..
..

.
.
1

This matrix has of size nn and is obtained from the preceding example (6.2) by eliminating the last row corresponding to the missing bottom spring. The constitutive equations
are still governed by Hookes law y = C e, as in (6.4), with C = diag(c 1 , . . . , cn ) the n n
diagonal matrix of spring stiffnesses. Finally, the force balance equations are also found
to have the same general form f = AT y as in (6.6), but with the transpose of the revised
incidence matrix (6.11). In conclusion, the equilibrium equations K x = f have an identical
form (6.9), based on the revised stiffness matrix

c1 + c 2
c2
c2

c2 + c 3
c3

c3
c3 + c 4
c4

c
c
+
c

c
4
4
5
5
T

K = A CA =
(6.12)

.
.
.

.
.
.
.
.
.

c
c
+ c c
n1

n1

cn

cn

Note that only the last entry has changed from the fixed end version (6.10).
3/7/03

196

c 2003

Peter J. Olver

Example 6.2. For a three mass chain with


constants c1 = c2 = c3 = 1. The stiffness matrix is

1 1 0
1
0
K = AT A = 0 1 1 1 1
0 0
1
0 1

one free end and equal unit spring



2
0
0 = 1
0
1

1 0
2 1 .
1 1
T

Pulling the middle mass downwards with a unit force, whereby f = ( 0, 1, 0 ) , results in
the displacements


1
1
1

u=K f = 2 ,
so that
y = e = A u = 1 .
2
0

In this configuration, the bottom two masses have moved equal amounts, and twice as far
as the top mass. Because we are only pulling on the middle mass, the lower-most spring
hangs free and experiences no elongation, whereas the top two springs are stretched by the
same amount.
Similarly, for a chain of equal masses subject to a constant downwards gravitational
T
force f = ( m g, m g, m g ) , the equilibrium position is


3mg
3mg
mg
and
y = e = A u = 2 m g .
u = K 1 m g = 5 m g ,
mg
6mg
mg

Note how much further the masses have moved now that the restraining influence of the
bottom support has been removed. The top spring is experiencing the most strain, and is
thus the most likely to break, because it must support all three masses.
In contrast to the chain with two fixed ends, this system is called statically determinate
because the incidence matrix A is square and nonsingular. This means that it is possible to
solve the force balance law (6.6) directly for the internal forces y = A 1 f without having
to solve the full equilibrium equations for the displacements.
The Minimization Principle
According to Theorem 4.2, when the coefficient matrix of the linear system governing a
massspring chain is positive definite, the unique equilibrium solution can be characterized
by a minimization principle. The quadratic function to be minimized has a physical interpretation: it is the potential energy of the system. Nature is parsimonious when it comes to
energy: physical systems seek out equilibrium configurations that minimize energy. This
general minimization principle can often be advantageously used in the construction of
mathematical models, as well as in their solution, both analytical and numerical.
The energy function to be minimized can be determined directly from physical principles. For a massspring chain, the potential energy of a mass equals the product of the
applied force times its displacement: fi ui . The minus sign is the result of our convention that a positive displacement ui > 0 means that the mass has moved down, and hence
3/7/03

197

c 2003

Peter J. Olver

decreased its potential energy. Thus, the total potential energy due to external forcing on
all the masses in the chain is

n
X

f i ui = u T f .

i=1

Next, we calculate the internal energy of the system. The potential energy in a single
spring elongated by an amount e is obtained by integrating the internal force, y = c e,
leading to
Z e
Z e
y de =
c e de = 12 c e2 .
0

Totalling the contributions from each spring, we find the internal spring energy to be
n
1 X
c e2 =
2 i=1 i i

1
2

eT C e =

1
2

uT AT CA u =

1
2

uT K u,

where we used the incidence equation e = A u relating elongation and displacement. Therefore, the total potential energy is
p(u) =

1
2

uT K u u T f .

(6.13)

Theorem 4.2 implies that the unique minimizer of this quadratic function satisfies the
equilibrium equation K u = f .
Example 6.3. For a three mass chain with two fixed ends described in Example 6.1,
the potential energy function (6.13) has the explicit form



2 1 0
u1
f1
1
T
T
p(u) = ( u1 u2 u3 ) 1 2 1 u2 ( u1 u2 u3 ) f2
2
0 1 2
u3
f3
= u21 u1 u2 + u22 u2 u3 + u23 u1 f1 u2 f2 u3 f3 ,
T

where f = ( f1 , f2 , f3 ) is the external forcing. The minimizer of this particular quadratic


T
function gives the equilibrium displacements u = ( u1 , u2 , u3 ) of the three masses.

6.2. Electrical Networks.


An electrical network consists of wires that are joined together at their ends. A
junction where one or more wires are connected is called a node. Abstractly, we can view
any such network as a graph, the wires being the edges of the graph and the nodes the
vertices. To begin with we assume that the only electrical devices (batteries, inductors,
capacitors, etc.) in the network are resistors on the wires. Let Rj denote the resistance in
the j th wire. As we shall see, resistance (or, rather, its reciprocal) plays a very similar role
to spring stiffness.
In such a network, the basic equilibrium equations for electrical current are the consequence of three fundamental laws of electricity.
3/7/03

198

c 2003

Peter J. Olver

u1
R1

R2

u2

R3

u3

R5

R4
u4

Figure 6.3.

Simple Electrical Network.

Kirchhoff s Voltage Law : Voltage is defined as the electromotive force that moves
electrons through a wire. The voltage in a wire is induced by the difference in the voltage
potentials at the two ends, just as the gravitational force on a mass is induced by a
difference in gravitational potential. To quantify voltage, we need to assign an orientation
to the wire, in which case a positive voltage means the electrons move in the assigned
direction, while under a negative voltage they move in reverse. The original choice of
orientation is arbitrary, but once assigned will pin down the sign conventions used by
voltages, currents, etc. As a consequence, we use a digraph to represent the network,
in which each edge or wire is assigned a direction that indicates its starting and ending
vertices or nodes. A simple example is illustrated in Figure 6.3, and contains five wires
joined at four different nodes. The arrows indicate the orientations of the wires, while the
wavy lines are the electrical symbol for resistance.
In an electrical network, each node will have a voltage potential, denoted u i . If wire k
connects node i and node j, then its voltage vk equals the potential difference at its ends:
vk = ui uj ,

(6.14)

where i and j denote the starting and ending nodes of wire k. Note that v k > 0 if ui > uj ,
and so the electrons go from the starting node i to the ending node j, in accordance with
our choice of orientation. In our particular example, we have
v1 = u1 u2 ,

v 2 = u1 u3 ,

v 3 = u1 u4 ,

v 4 = u2 u4 ,

v 5 = u3 u4 .

Let us rewrite this system in matrix form


v = A u,
3/7/03

199

(6.15)
c 2003

Peter J. Olver

where, for our particular example,


1 1
1 0

A = 1 0

0 1
0 0

0
1
0
0
1

0
0

1 .

1
1

(6.16)

The alert reader will recognize this matrix as the incidence matrix (2.38) for the digraph
defined by the circuit; see (2.38). This is true in general the voltages along the wires of
an electrical network are related to the potentials at the nodes by the linear system (6.15),
where A is the incidence matrix of the network digraph. The rows of the incidence matrix
are indexed by the wires; the columns are indexed by the nodes. Each row of the matrix
A has a single + 1 in the column indexed by the starting node, and a single 1 in the
column of the ending node.
Kirchhoff s Voltage Law states that the sum of the voltages around each closed loop in
the network is zero. For example, in the circuit under consideration, around the left-hand
triangle we have
v1 + v4 v3 = (u1 u2 ) + (u2 u4 ) (u1 u4 ) = 0.
Note that v3 appears with a minus sign since we must traverse wire #3 in the opposite
direction to its assigned orientation when going around the loop in the counterclockwise
direction. The voltage law is a direct consequence of (6.15). Indeed, as discussed in
Section 2.6, the loops can be identified with vectors ` coker A = ker A T in the cokernel
of the incidence matrix, and so
` v = `T v = `T A u = 0.
Therefore, orthogonality of the voltage vector v to the loop vector ` is the mathematical
formulation of the zero-loop relation.
Given a prescribed set of voltages v along the wires, can one find corresponding
voltage potentials u at the nodes? To answer this question, we need to solve v = A u,
which requires v rng A. According to the Fredholm Alternative Theorem 5.45, the
necessary and sufficient condition for this to hold is that v be orthogonal to coker A.
Theorem 2.51 says that the cokernel of an incidence matrix is spanned by the loop vectors,
and so v is a possible set of voltages if and only if v is orthogonal to all the loop vectors
` coker A, i.e., the Voltage Law is necessary and sufficient for the given voltages to be
physically realizable in the network.
Ohms Law : Kirchhoffs Laws are related to the topology of the circuit how the
different wires are connected together. Ohms Law is a constitutive relation, indicating
what the wires are made of. The resistance along a wire, including any added resistors,
prescribes the relation between voltage and current or the rate of flow of electric charge.
The law reads
v k = R k yk ,
(6.17)
3/7/03

200

c 2003

Peter J. Olver

where vk is the voltage and yk (often denoted Ik in the engineering literature) denotes the
current along wire k. Thus, for a fixed voltage, the larger the resistance of the wire, the
smaller the current that flows through it. The direction of the current is also prescribed
by our choice of orientation of the wire, so that yk > 0 if the current is flowing from the
starting to the ending node. We combine the individual equations (6.17) into a matrix
form
v = R y,
(6.18)
where the resistance matrix R = diag(R1 , . . . , Rn ) > 0 is diagonal and positive definite.
We shall, in analogy with (6.4), replace (6.18) by the inverse relationship
y = C v,

(6.19)

where C = R1 is the conductance matrix , again diagonal, positive definite, whose entries
are the conductances ck = 1/Rk of the wires. For the particular circuit in Figure 6.3,

C=

c1

c2
c3
c4
c5

1/R1

1/R2

1/R3
1/R4
1/R5

(6.20)

Kirchhoff s Current Law : Finally, we stipulate that electric current is not allowed to
accumulate at any node, i.e., every electron that arrives at a node must leave along one of
the wires. Let yk , yl , . . . , ym denote the currents along all the wires k, l, . . . , m that meet
at node i in the network. We allow for an external current source of magnitude f i to send
electricity into the network through the ith node. The Current Law requires that the net
current into the node, namely
yk yl ym fi = 0,

(6.21)

must be zero. Each sign is determined by the orientation of the wire, with + if node i
is a starting node or if it is an ending node.
In our particular example, suppose that we send a 1 amp current source into the first
node. Then Kirchhoffs Current Law requires
y1 + y2 + y3 = 1,

y1 + y4 = 0,

y2 + y5 = 0,

y3 y4 y5 = 0.

The matrix form of this system is


AT y = f ,

(6.22)

where y = ( y1 , y2 , y3 , y4 , y5 ) are the currents along the five wires, and f = ( 1, 0, 0, 0 )


represents the current sources at the four nodes. The coefficient matrix

1
1
1
0
0
0
1
0
1 0
AT =
(6.23)
,
0 1 0
0
1
0
0 1 1 1
3/7/03

201

c 2003

Peter J. Olver

is the transpose of the incidence matrix (6.16). As in the massspring chain, this is a
general fact, and is an immediate result of Kirchhoffs two laws. The coefficient matrix for
the current law is the transpose of the incidence coefficient matrix for the voltage law.
Let us assemble the full system of equilibrium equations:
v = A u,

f = AT y.

y = C v,

(6.24)

Remarkably, we arrive at a system of linear relations that has an identical form to the
massspring chain system (6.8). As before, they combine into a single linear system
Ku = f,

where

K = AT C A

(6.25)

is the resistivity matrix associated with the given network. In our particular example,
combining (6.16), (6.20), (6.23) produces the resistivity matrix

c1 + c 2 + c 3
c1
c2
c3
c1
c1 + c 4
0
c4

K = AT C A =
(6.26)

c2
0
c2 + c5
c5
c3
c4
c5
c3 + c 4 + c 5
depending on the conductances of the five wires in the network.

Remark : There is a simple pattern to the resistivity matrix, evident in (6.26). The
diagonal entries kii equal the sum of the conductances of all the wires having node i at
one end. The non-zero off-diagonal entries kij , i 6= j, equal ck , the conductance of the
wire joining node i to node j, while kij = 0 if there is no wire joining the two nodes.
Consider the case when all the wires in our network have equal unit resistance, and
so ck = 1/Rk = 1 for k = 1, . . . , 5. Then the resistivity matrix is
3 1 1
0
1 2
K=
1 0
2
1 1 1

1
1
.
1
3

(6.27)

However, trying to solve the system (6.25) runs into an immediate difficulty: there is no
solution! The matrix (6.27) is not positive definite it has zero determinant, and so is
T
not invertible. Moreover, the particular current source vector f = ( 1, 0, 0, 0 ) does not lie
in the range of K. Something is clearly amiss.
Before getting discouraged, let us sit back and use a little physical intuition. We are
trying to put a 1 amp current into the network at node 1. Where can the electrons go? The
answer is nowhere they are trapped and something drastic will happen sparks will fly!
This is clearly an unstable situation, and so the fact that the equilibrium equations do not
have a solution is trying to tell us that the physical system cannot remain in equilibrium.
The physics rescues the math, or, vice versa, the math elucidates the physics.

This assumes that there is only one wire joining the two nodes.

3/7/03

202

c 2003

Peter J. Olver

In order to have a steady state in an electrical network, we must remove as much


current as we put in. In other words, the sum of all the current sources,
f1 + f2 + + fn = 0,

must vanish. For example, if we feed a 1 amp current into node 1, then we must extract
a total of 1 amps worth of current from the other nodes. If we extract a 1 amp current
T
from node 4, the modified current source vector f = ( 1, 0, 0, 1 ) lies in the range of K
and the equilibrium system (6.25) has a solution. Fine . . .
But we are not out of the woods yet. As we know, if a linear system has a singular
square coefficient matrix, then either it has no solutions the case we already rejected
or it has infinitely many solutions the case we are considering now. In the particular
network under consideration, the general solution to the linear system

3 1 1 1
u1
1
0 1 u2 0
1 2

1 0
2 1
u3
0
1 1 1 3
u4
1
is found by Gaussian elimination:
1

u =

2
1
4
1
4



+ t

1
= + t ,

1
+t
1
t
0
+t

2
1
4
1
4

(6.28)

where t = u4 is the free variable. The nodal voltage potentials


u1 =

1
2

+ t,

u2 =

1
4

+ t,

u3 =

1
4

+ t,

u4 = t,

depend on a free parameter t.


The ambiguity arises because we have not specified a baseline value for the voltage
potentials. Indeed, voltage potential is a mathematical abstraction that cannot be measured directly; only relative potential differences have physical import. To eliminate the
ambiguity, one needs to assign a base potential level. (A similar ambiguity arises in the
specification of gravitational potential.) In terrestrial electricity, the Earth is assumed
to be at a zero voltage potential. Specifying a particular node to have zero potential is
physically equivalent to grounding that node. Grounding one of the nodes, e.g., setting
u4 = t = 0, will then uniquely specify the all other voltage potentials, resulting in a unique
solution u1 = 21 , u2 = 41 , u3 = 41 , u4 = 0, to the system.
On the other hand, even without specification of a baseline potential level, the corresponding voltages and currents along the wires are uniquely specified. In our example,
computing y = v = A u gives
y1 = v 1 =

1
4

y2 = v 2 =

1
4

y3 = v 3 =

1
2

y4 = v 4 =

1
4

y5 = v 5 =

1
4

independent of the value of t in (6.28). Thus, the nonuniqueness of the voltage potential
solution u is not an essential difficulty. All physical quantities that we can measure
currents and voltages are uniquely specified by the solution to the equilibrium system.
3/7/03

203

c 2003

Peter J. Olver

Remark : Although they have no real physical meaning, we cannot dispense with the
nonmeasurable (and nonunique) voltage potentials u. Most circuits are statically indeterminate since their incidence matrix is rectangular and not invertible, and so the linear
system AT y = f cannot be solved directly for the currents in terms of the voltage sources
it does not have a unique solution. Only by first solving the full equilibrium system
(6.25) for the potentials, and then using the relation y = CA u between the potentials and
the currents, can we determine the actual values of the currents in our network.
Let us analyze what is going on in the context of our general mathematical framework.
Proposition 3.32 says that the resistivity matrix K = AT CA is positive definite (and
hence nonsingular) provided A has linearly independent columns, or, equivalently, ker A =
{0}. But Proposition 2.49 says than the incidence matrix A of a directed graph never
has a trivial kernel. Therefore, the resistivity matrix K is only positive semi-definite,
and hence singular. If the network is connected, then ker A = ker K = coker K is oneT
dimensional, spanned by the vector z = ( 1, 1, 1, . . . , 1 ) . According to the Fredholm
Alternative Theorem 5.45, the fundamental network equation K u = f has a solution if
and only if f is orthogonal to coker K = ker K = ker A, and so the current source vector
must satisfy
f z = f1 + f2 + + fn = 0,
(6.29)
as we already observed. Therefore, the linear algebra reconfirms our physical intuition: a
connected network admits an equilibrium configuration, obtained by solving (6.25), if and
only if the nodal current sources add up to zero, i.e., there is no net influx of current into
the network.
Grounding one of the nodes is equivalent to specifying the value u i = 0 of the voltage
potential there. This variable is now fixed, and can be safely eliminated from our system.
To accomplish this, we let A? denote the m (n 1) matrix obtained by deleting the ith
column from A. For example, if we ground node number 4 in our sample network, then
we erase the fourth column of the incidence matrix (6.16), leading to the reduced incidence
matrix

1 1 0
1 0 1

?
A = 1 0
0 .
(6.30)

0 1
0
0 0
1
The key observation is that A? has trivial kernel, ker A? = {0}, and therefore the reduced
network resistivity matrix

c1 + c 2 + c 3
c1
c2
c1
c1 + c 4
0 .
K ? = (A? )T CA? =
(6.31)
c2
0
c2 + c5
is positive definite. Note that we can obtain K ? directly from K by deleting its fourth
T
row and column. Let f ? = ( 1 0 0 ) denote the reduced current source vector obtained
by deleting the fourth entry from f . Then the reduced linear system is
K ? u? = f ? ,
3/7/03

where
204

u ? = ( u1 , u2 , u3 ) ,
c 2003

(6.32)
Peter J. Olver

is the reduced voltage potential vector, obtained by deleting the fourth entry of u. Positive definiteness of K ? implies that (6.32) has a unique solution u? , from which we can
reconstruct the voltages v = A? u? and currents y = C v = CA? u? along the wires. In
our example, if all the wires have unit resistance, then the reduced system (6.32) is


1
3 1 1
u1

1 2

0
u2 = 0 ,
0
1 0
2
u3

T
and has unique solution u? = 21 14 41 . The voltage potentials are
u1 = 12 ,

u2 = 41 ,

u3 = 14 ,

u4 = 0,

and correspond to the earlier solution with t = 0. The corresponding voltages and currents
along the wires are the same as before.
So far, we have only considered the effect of current sources at the nodes. Suppose
now that the circuit contains one or more batteries. Each battery serves as a voltage source
along one of the wires, and we let bk denote the voltage of a battery connected to wire k.
The quantity bk comes with a sign, indicated by the positive and negative terminals on the
battery. Our convention is that bk > 0 if the current from the battery runs in the same
direction as our chosen orientation of the wire. The battery voltage modifies the voltage
balance equation (6.14):
vk = ui uj + bk .
The corresponding matrix form (6.15) becomes
v = A u + b,

(6.33)

where b = ( b1 , b2 , . . . , bm ) is the battery vector whose entries are indexed by the wires.
(If there is no battery on wire k, the corresponding entry is bk = 0.) The remaining two
equations are as before, so y = C v are the currents in the wires, and, in the absence of
external current sources, Krichhoffs Current Law implies AT y = 0. Using the modified
formula (6.33) for the voltages, these combine into the following equilibrium system
K u = AT C b.

(6.34)

Therefore, the batteries have exactly the same effect on the circuit as if we imposed the
current source vector
f = AT C b.
(6.35)
Namely, the effect of the battery of voltage bk on wire k is the exactly the same as introducing a additional current sources of ck bk at the starting node and ck bk at the ending
node. Note that the induced current vector f rng K continues to satisfy the network
constraint (6.29). Vice versa, a given system of current sources f has the same effect as
any collection of batteries b that satisfies (6.35).
As before, to eliminate the ambiguity in the solution u, we can ground one of the nodes
and use the reduced incidence matrix A? and reduced current source vector f ? obtained
by eliminating the column/entry corresponding to the grounded node.
3/7/03

205

c 2003

Peter J. Olver

Example 6.4. Consider an electrical network running along the sides of a cube,
where each wire contains a 2 ohm resistor and there is a 9 volt battery source on one wire.
The problem is to determine how much current flows through the wire directly opposite the
battery. Orienting the wires and numbering them as indicated in Figure 6.3, the incidence
matrix is

1 1 0
0
0
0
0
0
0
0
0
0
1 0 1 0

0 1 0
0
0
0
1 0

0
0 1 0
0
0
0 1

0
0
0 1 0
0
0 1

1
0 1 0
0
0
0 0
A=
.
1
0
0
0 1 0
0 0

0
1
0 1 0
0
0 0

0
1
0
0 1 0
0 0

0
0
1
0
0 1
0 0

0 0
0
0
0
1
0 1
0 0
0
0
0
0
1 1
We connect the battery along wire #1 and measure the resulting current along wire #12.
To avoid the ambiguity in the voltage potentials, we ground the last node and erase the
final column from A to obtain the reduced incidence matrix A? . Since the resistance
matrix R has all 2s along the diagonal, the conductance matrix is C = 21 I . Therefore the
network resistivity matrix is

3 1 1 1 0
0
0
0
0 1 1 0
1 3

3
0 1 0 1
1 0
1

K ? = (A? )T CA? = 12 (A? )T A? = 1 0


0
3
0 1 1 .

2
3
0
0
0 1 1 0

0 1 0 1 0
3
0
0
0 1 1 0
0
3
T

The current source corresponding to the battery b = ( 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) along


the first wire is
T
f ? = (A? )T C b = 12 ( 9, 9, 0, 0, 0, 0, 0 ) .
Solving the resulting linear system by Gaussian elimination, the voltage potentials are

T
u? = K 1 f ? = 3, 94 , 98 , 98 , 38 , 38 , 43
.

Thus, the induced currents along the sides of the cube are

15 15
15
15 3 3 3 3
3
3 3 T
.
y = CA? u? = 21
8 , 16 , 16 , 16 , 16 , 4 , 16 , 2 , 16 , 16 , 16 , 8

In particular, the current on the wire that is opposite the battery is y12 = 38 .
The Minimization Principle and the ElectricalMechanical Analogy
As with our massspring chain, there is a corresponding minimization principle which
governs the current flows in such a resistive network. The principle is based on fundamental
3/7/03

206

c 2003

Peter J. Olver

R 12

u7
R7

u8

R 10

R9

u3
R6
u4

R2

R4

R8
u6

R3
u1

Figure 6.4.

R 11

u5

R5
R1

u2

Cubical Electrical Network with a Battery.

physical laws governing energy in the electrical network. The power in an electrical circuit
is defined as the rate of change of energy. Given a current y flowing through a wire of
resistance R, the power is
P = R y2 = c v2 ,
(6.36)
where v = R y is the voltage and c = 1/R the conductance. Physically, the power tells us
the rate at which electrical energy is coverted into heat by the resistance along the wire,
or into other types of energy by light bulbs, speakers, etc. Summing over all the wires in
the network, the total power is the matrix product
m
X

ck vk2 = vT C b = uT AT C A u = uT K u.

k=1

If a battery with voltage b is added into a wire, the effect on the power is doubled,
since
P = c (v + b)2 = c v 2 + 2 c v b + c b2 ,
and hence in matrix form
m
X

ck (vk + bk )2 = vT Cv + 2 vT C b + bT C b.

k=1

The last term is the internal power in the battery, and does not depend upon the currents/voltages in the wires. As a result, we ignore it and concentrate on the first and
second terms. Now, according to (6.35), the batteries b have the same effect as putting
current sources f = AT C b at the nodes. Therefore, the middle term can be written as
2 v T C b = 2 u T AT C b = 2 u T f .
By an analogous argument, any current sources at the nodes contribute to the power in
the same manner.
3/7/03

207

c 2003

Peter J. Olver

As a result, the total power in a circuit with current sources f is equal to twice the
quadratic function
p(u) = 21 uT Ku uT f .
(If we have batteries, we replace f by its modification from (6.34).) Grounding one of the
nodes leads to the reduced form
p(u? ) =

1
2

(u? )T K ? u? (u? )T f ? ,

for the power, now with positive definite coefficient matrix K ? > 0. The minimizer of
the power is the solution to the linear system. Therefore, the network adjusts itself to
minimize the power or total energy loss! Just as with mechanical systems, nature solves a
minimization problem in an effort to conserve energy.
We have discovered the remarkable similarity between the equilibrium equations for
electrical networks (6.8), and those of massspring chains (6.24). The fundamental Electrical
Mechanical Analogy is summarized in the following table. In the mechanical side, we use
v instead of e to represent elongation.

Structures
Displacements
Elongations
Spring stiffnesses
Internal Forces
External forcing
Stiffness matrix
Prestressed bars/springs
Potential energy

Variables

Networks

u
v = Au
C
y=Cv
f = AT y
K = AT C A
v = Au + b
p(u) = 21 uT Ku uT f

Voltages
Voltage drops
Conductivities
Currents
Current sources
Resistivity matrix
Batteries
1
2 Power

In the following section, we will see that the analogy immediately extends to more
general structures.

6.3. Structures in Equilibrium.


A structure (also known as a truss) is a mathematical idealization of a framework
for a building. Think of a skyscraper when just the I-beams are connected together
before the walls, floors, ceilings, roof and ornamentation are added. An ideal structure
is constructed of elastic bars connected at joints. By a bar , we mean a straight, rigid
rod that can be (slightly) elongated, but not bent. (Beams, which are allowed to bend,

In an alternating circuit, the factor of 2 disappears, and the analogy is more direct.

3/7/03

208

c 2003

Peter J. Olver

are more complicated, requiring differential equations. We defer their treatment until
Section 10.4.) When a bar is stretched, it obeys Hookes law (at least in the linear regime
we are currently working in) and so, for all practical purposes, behaves like a spring with a
very large stiffness. As a result, a structure can be viewed as a two- or three-dimensional
generalization of a massspring chain.
The joints will allow the bar to rotate in any direction. Of course, this is an idealization; in a building, the rivets and bolts will prevent rotation to a significant degree. However, under moderate stess for example, if the wind is blowing through our skyscraper,
the rivets and bolts can only be expected to keep the structure connected, and the rotational motion will provide stresses on the bolts which must be taken into account when
designing the structure. Of course, under extreme stress, the structure will fall apart
a disaster that its designers must avoid. The purpose of this section is to derive conditions that will guarantee that a structure is rigidly stable under moderate forcing, or,
alternatively, determine the mechanisms that might lead to its collapse.
Bars
The first order of business is to understand how an individual bar reacts to motion. We
have already encountered the basic idea in our treatment of springs. The key complication
here is that the ends of the bar/spring are not restricted to a single direction of motion,
but can move in either two or three-dimensional space. We use d = 2 or 3 denote the
dimension of the underlying space. (The case d = 1 reduces to a massspring chain.)
Consider an unstressed bar of length L, with one end at position a 1 R d and the
T
other end at position a2 R d . In two dimensions, we write ai = ( ai , bi ) , while in threeT
dimensional space ai = ( ai , bi , ci ) . The length of the bar is L = k a1 a2 k, where we
use the standard Euclidean norm on R d throughout this section.
Suppose we move the ends of the bar a little, displacing ai to ai + ui and simultaneously displacing aj to aj + uj . The vectors ui , uj R d indicate the direction of
displacement of the two ends, and we think of , the magnitude of the displacement, as
small. How much has this motion stretched the bar? The new length of the bar is
L + e = k (ai + ui ) (aj + uj ) k = k (ai aj ) + (ui uj ) k
q
= k ai aj k2 + 2 (ai aj ) (ui uj ) + 2 k ui uj k2 .

The difference between the new length and the original length, namely
q
e =
k ai aj k2 + 2 (ai aj ) (ui uj ) + 2 k ui uj k2 k ai aj k,

(6.37)

(6.38)

is, by definition, the elongation of the bar.


If the underlying dimension d is bigger than 1, the elongation e is a nonlinear function
of the displacement vectors ui , uj . Thus, an exact, geometrical treatment of structures in
equilibrium requires dealing with nonlinear systems of equations. However, in practical
situations, the displacements are fairly small, so 1. For example, in the motion
of a building, the lengths of bars are in meters, but the displacements are, typically in
3/7/03

209

c 2003

Peter J. Olver

Figure 6.5.

Tangent Line Approximation.

centimeters if not millimeters. In such situations, we can approximate the elongation by a


much simpler linear function.
In order to linearize, we shall approximate a nonlinear function g() for small values
of 1 by its tangent line or linear Taylor polynomial
g() g(0) + g 0 (0)

(6.39)

at = 0, cf. Figure 6.5. Here we are concerned with square root functions of the form
p
g() = a + b + 2 c ,

where a, b, c are constants. We first compute g(0) = a and g 0 (0) = b/(2 a). Substituting
these values into (6.39) leads to the required linear approximation
p

b
a + b + 2 c a + .
2 a

Using this approximation in the elongation formula (6.38), we find that the constant terms
cancel, and we arrive at the linear approximation
e

(ai aj ) (ui uj )
= n (ui uj ),
k ai aj k

for the elongation of the displaced bar. The vector n = (ai aj )/k ai aj k is the unit
vector, k n k = 1, in the direction of the bar that goes from node j to node i.
The overall small factor of was merely a device used to derive the linear approximation. It can now be safely discarded, so that the displacement of the i th node is now ui
instead of ui , and we assume k ui k is small. If bar k connects node i to node j, then its
(approximate) elongation is equal to
ek = nk (ui uj ) = nk ui nk uj ,

where

nk =

ai a j
.
k ai aj k

(6.40)

The elongation ek is the sum of two terms: the first, nk ui , is the component of the
displacement vector for node i in the direction of the unit vector nk that points along the
bar towards node i, whereas the second, nk uj , is the component of the displacement
vector for node j in the direction of the unit vector nk that points in the opposite
direction along the bar towards node j. The combination is the total elongaiton.
3/7/03

210

c 2003

Peter J. Olver

Figure 6.6.

Three Bar Planar Structure.

We assemble all the linear equations (6.40) relating nodal displacements to bar elongations in matrix form
e = A u.
(6.41)
T

Here e = ( e1 , e2 , . . . , em ) R m is the vector of elongations, whose entries are indexed


u
1
u2
dn
d

by the bars, while u =


.. R is the vector of displacements. Each ui R is itself
.

un
a column vector with d entries, and
u has a total of d n entries. For example, in the
so
xi
since each node has both an x and y component
planar case d = 2, we have ui =
yi
to its displacement, and so
x
1

y1
u

1
x2

u2
2n
y

u =
.. = .2 R .
.
.
.
un

xn
yn
T

In three dimensions, d = 3, we have ui = ( xi , yi , zi ) , and so each node will contribute


three components to the displacement vector
u = ( x 1 , y1 , z1 , x 2 , y2 , z2 , . . . , x n , yn , zn )

R3 n.

The incidence matrix A connecting the displacements and elongations will be of size
m (d n). The k th row of A will have (at most) 2 d nonzero entries. The entries in the d
slots corresponding to node i will be the components of the (transposed) unit bar vector n Tk
pointing towards node i, as given in (6.40), while the entries in the d slots corresponding
to node j will be the components its negative nTk , which is the unit bar vector pointing
towards node j. All other entries are 0.
Example 6.5. Consider the planar structure pictured in Figure 6.6. The four nodes
3/7/03

211

c 2003

Peter J. Olver

are at positions
a1 = (0, 0)T ,

a2 = (1, 1)T ,

a3 = (3, 1)T ,

a4 = (4, 0)T ,

so the two side bars are at 45 angles and the center bar is horizontal. Applying our
algorithm, the associated incidence matrix is

12 12 12 12
0
0
0 0

(6.42)
A= 0
0 .
1
0 0
0 1 0
1
1
1
1
0
0 0
0 2 2 2 2

The three rows of A refer to the three bars in our structure. The columns come in pairs,
as indicated by the vertical lines in the matrix: the first two columns refer to the x and
y displacements of the first node; the third and fourth columns refer to the second node,
and so on. The first two entries of the first row of A indicate the unit vector

T
a1 a 2
= 12 , 12
n1 =
k a1 a2 k
that points along the first bar towards the first node, while the third and fourth entries
have the opposite signs, and constitute the unit vector

T
a2 a 1
n1 =
= 12 , 12
k a2 a1 k

along the same bar that points towards the second node. The remaining entries are zero
because the first bar connects the first two nodes. Similarly, the unit vector along the
second bar pointing towards node 2 is
n2 =

a2 a 3
T
= ( 1, 0 ) ,
k a2 a3 k

and this gives the third and fourth entries of the second row of A. And so on.
Remark : Interestingly, the incidence matrix for a structure only depends on the directions of the bars and not their lengths. This is analogous to the fact that the incidence
matrix for an electrical network only depends on the connectivity properties of the wires
and not on their overall lengths. Indeed, one can regard the incidence matrix for a structure
as a kind of ddimensional generalization of the incidence matrix for a directed graph.
The next phase of our procedure is to apply the constitutive relations to determine the
internal forces (stresses) within the bars. As we remarked at the beginning of the section,
each bar is viewed as a very strong spring, subject to a linear Hookes law equation
yk = c k e k

(6.43)

that relates its elongation ek to its internal force yk . The bar stiffness ck > 0 is a positive
scalar, and so yk > 0 if the bar is in tension, while yk < 0 is the bar is compressed. We
write (6.43) in matrix form
y = C e,
3/7/03

212

c 2003

Peter J. Olver

where C = diag(c1 , . . . , cm ) > 0 is a diagonal, positive definite matrix. In this approximation, there is no bending and the bars will only experience external forcing at the
nodes.
Finally, we need to balance the forces at each node in order to achieve equilibrium. If
bar k terminates at node i, then it exerts a force yk nk on the node, where nk is the unit
vector pointing towards the node in the direction of the bar, as in (6.40). The minus sign
comes from physics: if the bar is under tension, so yk > 0, then it is trying to contract back
to its unstressed state, and so will pull the node towards it in the opposite direction to
nk while a bar in compression will push the node away. In addition, we may have an
externally applied force vector, denoted by f i , on node i, which might be some combination
of gravity, weights, mechanical forces, and so on. Force balance at equilibrium requires
that the sum of all the forces, external and internal, at each node cancel; thus,
X
X
fi +
( yk nk ) = 0,
or
yk n k = f i ,
k

where the sum is over all the bars that are attached to node i. The matrix form of the
force balance equations is (and this should no longer come as a surprise)
f = AT y,

(6.44)
T

where AT is the transpose of the incidence matrix, and f = f n1 , f n2 , . . . , f n R d n is


the vector containing all external force on the nodes. Putting everything together,

e = A u,

y = C e,

f = AT y,

we once again are lead to our standard linear system of equations


Ku = f,

where

K = AT C A.

(6.45)

The stiffness matrix K is a positive (semi-)definite Gram matrix (3.48) associated with a
weighted inner product on the space of elongations.
As we know, the stiffness matrix K for our structure will be positive definite if and
only if the incidence matrix has trivial kernel: ker A = {0}. The preceding example, and
indeed all of these constructed so far, will not have this property, basically for the same
reason as in an electrical network because we have not tied down (or grounded) our
structure anywhere. In essence, we are considering a structure floating in outer space,
which free to move without changing its shape. Each possible rigic motion of the structure
will produce an element of the kernel of its incidence matrix.
Example 6.6. Consider a planar space station in the shape of a unit equilateral
triangle, as in Figure 6.7. Putting the nodes at

T
T
T
a1 = 21 , 23
,
a2 = ( 1, 0 ) ,
a3 = ( 0, 0 ) ,

we use the preceding algorithm to compute the incidence matrix



1
3
1 3
0
0
2
2
2

2

3
3 1
A = 12

0
0 .
2
2 2
0 1
0
0
0 1
3/7/03

213

c 2003

Peter J. Olver

Figure 6.7.

A Triangular Structure.

The kernel of A is three-dimensional, with basis


0

1

0

z2 =
,
1

0


1

0

1

z1 =
,
0

1

z3 =

3
2

1
2

0
1
0
0

(6.46)

These three displacement vectors correspond to three different planar rigid motions the
first two correspond to translations, and the third to a rotation.
The translations are easy to discern. Translating the space station in a horizontal
direction means that we move all three nodes the same amount, and so the displacements
T
are u1 = u2 = u3 = ( r, 0 ) where r is the distance translated. The full displacement
T
vector u = ( r, 0, r, 0, r, 0 ) = r z1 is a multiple of the first kernel basis vector. Similarly,
rigid vertical translations of all three nodes corresponds to a multiple, r z 2 , of the second
kernel basis vector. Any other translation is a linear combination of these two types.
Translations do not alter the lengths of any of the bars, and so do not induce any stress
in the structure.
The rotations are a little more subtle, owing to the linear approximation that we used
to compute the elongations. Referring to Figure 6.8, rotating the space station through
T
an angle t around the last node a3 = ( 0, 0 ) will move the other two nodes to

b1 =
3/7/03

1
2
1
2

cos t
sin t +

3
sin t
2
3
2 cos t

b2 =
214

!
cos t
,
sin t

!
0
b3 =
.
0
c 2003

(6.47)

Peter J. Olver

Rotating a Space Station.

Figure 6.8.

However, the corresponding displacements

3
1
(cos
t

1)

sin
t
2
2

,
1
3
sin
t
+
(cos
t

1)
2
2

u1 = b 1 a 1 =

u2 = b 2 a 2 =

cos t 1
,
sin t

!
0
u3 = b3 a3 =
,
0

(6.48)

do not combine into one of the vectors in ker A. The problem is that, under a rotation,
the nodes move along circles, while the kernel displacements u = t z ker A require
the nodes to move along straight lines! In order to maintain consistency with our linear
approximation, we must approximate the nonlinear circular motion of the nodes by a linear
straight line motion. In other words, we should use the linear tangent line approximations
(6.39) to the circles. Thus, for the rotation (6.47), we replace the nonlinear displacements
uj (t) in (6.48) by their linear tangent approximations t u0j (0), and so
u1 t

3
2
1
2

u2

!
0
t
,
1

!
0
u3 =
.
0

The resulting displacements do combine to produce the displacement vector



u = t 23

1
2

0 t 0 0

= t z3

that moves the space station in the direction of the third element of the kernel of the
incidence matrix! Thus, as claimed, z3 represents the linear approximation to a rigid
rotation around the first node.

Note that uj (0) = 0.

3/7/03

215

c 2003

Peter J. Olver

Remarkably, the rotations around the other two nodes, although distinct nonlinear
motions, can be linearly approximated by particular combinations of the three kernel basis
elements, and so already appear in our description of ker A. For example, the displacement
vector

T

3
3
3
1
1
1
u = t 2 z1 + 2 z2 z 3 = 0 0 2 t 2 t 2 t 2 t
(6.49)
represents the linear approximation to a rigid rotation around the first node. See Exercise
for details. We conclude that the three-dimensional kernel of the incidence matrix represents the sum total of all possible rigid motions of the space station, or, more correctly,
their linear approximations.
Which types of forces will maintain the space station in equilibrium? This will happen
if and only if we can solve the force balance equations
AT y = f

(6.50)

for the internal forces y. The Fredholm Alternative Theorem 5.45 implies that the system
T
(6.50) has a solution if and only if f = ( f1 g1 f2 g2 f3 g3 ) lies in the corange of A, and
is hence orthogonal to the kernel:
f rng AT = corng A = (ker A) .
Therefore, f must be orthogonal to the basis vectors (6.46), and so must satisfy the three
constraints
z1 f = f1 + f2 + f3 = 0,

z2 f = g1 + g2 + g3 = 0,

z3 f =

3
2

f1 +

1
2

g1 + g3 = 0.

The first constraint requires that there is no net horizontal force on the space station. The
second requires no net vertical force. The last constraint requires that the moment of the
forces around the first node vanishes. Incidentally, these three equations imply that the
force moments around each of the other two nodes also vanish, since the associated kernel
vectors can be expressed as linear combinations of the three basis vectors. The physical
requirements are clear. If there is a net horizontal or vertical force, the space station will
rigidly translate in that direction; if there is a non-zero force moment, the station will
rigidly rotate. In any event, unless the force balance equations are satisfied, the space
station cannot remain in equilibrium. A freely floating space station is in an unstable
configuration.
Since there are three independent rigid motions, we need to impose three constraints
on the structure in order to stabilize it. Grounding one of the nodes, i.e., preventing it
from moving by attaching it to a fixed support, will serve to eliminate the two translational
instabilities. For example, setting u3 = (0, 0) has the effect of fixing the third node
of the space station to a support by preventing any displacements thereof. With this
specification, we can eliminate the variables associated with that node, and thereby delete
the corresponding columns of the incidence matrix leaving the reduced incidence matrix

3
0
0
2

2

3
1
3
A? = 12
.

2
2
2
0

3/7/03

216

c 2003

Peter J. Olver

The kernel of A? is now only one-dimensional, spanned by the single vector


z?3 =

3
2

1
2

0 1

which corresponds to (the linear approximation of) the rotations around the fixed node. To
prevent the structure from rotating, we can also fix the second node, by further requiring
u2 = (0, 0). This allows us to eliminate the third and fourth columns of the incidence
matrix and the resulting doubly reduced incidence matrix

3
2
3
2

1
2

A?? =

1
2

Now ker A?? = {0} is trivial, and hence the corresponding reduced stiffness matrix
K ?? = (A?? )T A?? =

1
2
3
2

12

3
2

1
2

0
12
0
0

3
2
3
2

1
2

3
2

is positive definite. The space station with two fixed nodes is a stable structure, which can
now support an arbitrary external forcing. (Although forces on the fixed nodes now have
no effect since they are no longer allowed to move.)
In general, a planar structure without any fixed nodes will have at least a threedimensional kernel, corresponding to the rigid planar motions of translations and (linear
approximations to) rotations. To stabilize the structure, one must fix two (non-coincident)
nodes. A three-dimensional structure without any fixed supports will admit 6 independent
rigid motions in its kernel. Three of these correspond to rigid translations in the three
coordinate directions, while the other three correspond to linear approximations to the
rigid rotations around the three coordinate axes. To eliminate the rigid motion instabilities
of the structure, one needs to fix three non-collinear nodes; details can be found in the
exercises.
Even after attaching a sufficient number of nodes to fixed supports so as to eliminate
all possible rigid motions, there may still remain nonzero vectors in the kernel of the
reduced incidence matrix of the structure. These indicate additional instabilities in which
the shape of the structure can deform without any applied force. Such non-rigid motions
are known as mechanisms of the structure. Since a mechanism moves the nodes without
elongating any of the bars, it does not induce any internal forces. A structure that admits
a mechanism is unstable even very tiny external forces may provoke a large motion, if
not collapse.
Example 6.7. Consider the three bar structure of Example 6.5, but now with its
two ends attached to supports, as pictured in Figure 6.9. Since we are fixing nodes 1 and
4, so u1 = u4 = 0, we should remove the first two and last column pairs from the full
3/7/03

217

c 2003

Peter J. Olver

Figure 6.9.

Three Bar Structure with Fixed Supports.

incidence matrix (6.42), leading to the reduced incidence matrix

1
0
0

A? = 1 0 1
0 .

1
1
0
0 2 2

The structure no longer admits any rigid motions. However, the kernel of A ? is onedimensional, spanned by reduced displacement vector
T

z? = ( 1 1 1 1 ) ,
whose entries indicate certain displacements of the two remaining free nodes corresponding
T
to the unstable mechanism that displaces the second node in the direction u 2 = ( 1 1 )
T
and the third node in the direction u3 = ( 1 1 ) . Geometrically, then, z? represents
the displacement where node 2 moves down and to the left at a 45 angle, while node 3
moves simultaneously up and to the left at a 45 angle. This mechanism does not alter
the lengths of the three bars (at least in our linear approximation regime) and so requires
no net force to be set into motion.
As with the rigid motions of the space station, an external forcing vector f ? will only
maintain equilibrium when it lies in the corange of A? , and hence must be orthogonal to
T
all the mechanisms in ker A? = (corng A? ) . Here, the nodal forces f 2 = ( f2 , g2 ) and
T
f 3 = ( f3 , g3 ) must satisfy the balance law
z? f ? = f2 g2 + f3 + g3 = 0.
If this fails, the equilibrium equation has no solution, and the stucture will move. For
example, a uniform horizontal force f2 = f3 = f , g2 = g3 = 0, will induce the mechanism,
whereas a uniform vertical force, f2 = f3 = 0, g2 = g3 = g, will maintain equilibrium. In
the latter case, the solution to the equilibrium equations

3
1
1
0
2
2

1
1
0
0
2
? ?
?
?
? T ?
2
K u =f ,
where
K = (A ) A =
,
1
3
1 0

2
2
0

3/7/03

218

21

1
2

c 2003

Peter J. Olver

Figure 6.10.

Reinforced Planar Structure.

is indeterminate, since we can add in any element of ker K ? = ker A? , so


T

u? = ( 3 g 5 g 2 g 0 ) + t ( 1 1 1 1 ) .
In other words, the equilibrium position is not unique, since we can still move the structure
using the mechanism while maintaining the overall force balance. The elongations and
internal forces

T
y = e = A ? u? = ( 2 1 2 ) ,
are well-defined, indicating that, under our stabilizing uniform vertical force, all three bars
are compressed, with the two diagonal bars experiencing 41.4% more compression than
the horizontal bar.
Remark : Just like the rigid rotations, the mechanisms described here are linear approximations to the actual nonlinear motions. In a physical structure, the vertices will
move along curves whose tangents at the initial configuration are the directions indicated
by the mechanism. In certain cases, a structure can admit an linearly approximate mechanism, but one that cannot be physically realized due to the nonlinear constraints imposed
by the geometrical configurations of the bars. Nevertheless, such a structure is at best
borderline stable, and should not be used in any real-world applications.
We can always stabilize a structure by first fixing two or three nodes to eliminate
rigid motions, and then adding in extra bars to eliminate mechanisms. In the preceding
example, suppose we attach an additional bar connecting nodes 2 and 4, leading to the
reinforced structure in Figure 6.10. The revised incidence matrix is

A=

12
0
0
0

12 12

0 1

0
0

0 310

1
2

0
0
1
10

0
1
12
0

1
2
0
0
0

1
2
3

10

12

110

and is obtained from (6.42) by including another row representing the added bar. When
3/7/03

219

c 2003

Peter J. Olver

nodes 1 and 4 are fixed, the reduced incidence matrix


1
1
0
2
2

1
0
1
A? =

0
0
12

310 110
0

1
2
0

has trivial kernel, ker A? = {0}, and hence the structure is stable. It admits no mechanisms, and can support any configuration of forces (within reason mathematically the
structure will support an arbitrarily large external force, but very large forces will take us
outside the linear regime described by the model, and the structure will be crushed!).
This particular case is statically determinate owing to the fact that the incidence
matrix is square and nonsingular, which implies that one can solve the force balance
equations (6.50) directly for the internal forces without the necessity of solving the full
displacement equations (6.45). For instance, a uniform downwards vertical force f 2 =
f3 = 0, g2 = g3 = 1, e.g., gravity, will produce the internal forces

y2 = 1,
y3 = 2,
y4 = 0
y1 = 2,
indicating that bars 1, 2 and 3 are experiencing compression, while, interestingly, the
reinforcing bar 4 remains unchanged in length and hence experiences no internal force.
Assuming the bars are all of the same material, and taking the elastic constant to be 1, so
C = I , then

12
1
1
0
5
5

1
3
0
0

5
K ? = (A? )T A? = 5
.
3
1
1 0

2
2
0

21

1
2

The solution to the reduced equilibrium equations is


T

so
u2 = 12 32 ,
u? = 12 32 32 72 ,

T
u3 = 23 27 .

give the displacements of the two nodes under the applied force. Both are moving down and
to the left, with the third node moving relatively farther owing to its lack of reinforcement.
Suppose we reinforce the structure yet further by adding in a bar connecting nodes 1
and 3. The resulting reduced incidence matrix

1
1
0
0
2
2

1
0
1
0

1
1

0
0

A =
2
2

3
1
0
0

10
10
0

3
10

1
10

again has trivial kernel, ker A? = {0}, and hence the structure is stable. Indeed, adding
in extra bars to a stable structure cannot cause it to lose stability. (In matrix language,
3/7/03

220

c 2003

Peter J. Olver

A Swing Set.

Figure 6.11.

appending additional rows to a matrix cannot increase the size of its kernel; see Exercise .)
Since the incidence matrix is rectangular, the structure is now statically indeterminate and
we cannot determine the internal forces without first solving the full equilibrium equations
(6.45) for the displacements. The stiffness matrix is

12
5
1
5

K = (A ) A =
1
?

? T

1
5
3
5

12
5
51

0
.
1
5

3
5

For the same uniform vertical force, the displacement u? = (K ? )1 f ? is


u? =

1
10

1
17
17
10 10 10

so that the free nodes now move symmetrically down and towards the center of the structure. The internal forces on the bars are
y_1 = -\tfrac{4}{5}\sqrt{2}, \qquad y_2 = -\tfrac{1}{5}, \qquad y_3 = -\tfrac{4}{5}\sqrt{2}, \qquad y_4 = -\sqrt{\tfrac{2}{5}}, \qquad y_5 = -\sqrt{\tfrac{2}{5}}.
All five bars are now experiencing compression, the two outside bars being the most stressed, the reinforcing bars slightly more than half as much, while the center bar feels less than a
fifth the stress that the outside bars experience. This simple computation should already
indicate to the practicing construction engineer which of the bars in our structure are more
likely to collapse under an applied external force. By comparison, the reader should see
what happens under a uniform horizontal force, and interpret the solution physically.
Summarizing our discussion, we have the following fundamental result on stability
and equilibrium of structures.

Theorem 6.8. A structure is stable, and will maintain an equilibrium under arbitrary external forcing, if and only if its reduced incidence matrix A* has linearly independent columns, or, equivalently, ker A* = {0}. More generally, an external force f* on a structure will maintain equilibrium if and only if f* ∈ (ker A*)^⊥, which means that the external force is orthogonal to all rigid motions and all mechanisms admitted by the structure.
Example 6.9. A swing set is to be constructed, consisting of two diagonal supports


at each end and a horizontal cross bar. Is this configuration stable, i.e., can a child swing
on it without it collapsing? The moveable joints are at positions
a_1 = ( 1, 1, 3 )^T, \qquad a_2 = ( 4, 1, 3 )^T.
The four fixed supports are at positions
a_3 = ( 0, 0, 0 )^T, \quad a_4 = ( 0, 2, 0 )^T, \quad a_5 = ( 5, 0, 0 )^T, \quad a_6 = ( 5, 2, 0 )^T.

The reduced incidence matrix for the structure is calculated in the usual manner:

A^\star = \begin{pmatrix}
 1/\sqrt{11} & 1/\sqrt{11} & 3/\sqrt{11} & 0 & 0 & 0 \\
 1/\sqrt{11} & -1/\sqrt{11} & 3/\sqrt{11} & 0 & 0 & 0 \\
 -1 & 0 & 0 & 1 & 0 & 0 \\
 0 & 0 & 0 & -1/\sqrt{11} & 1/\sqrt{11} & 3/\sqrt{11} \\
 0 & 0 & 0 & -1/\sqrt{11} & -1/\sqrt{11} & 3/\sqrt{11}
\end{pmatrix}.

For instance, the first three entries in the first row refer to the unit vector n_1 = (a_1 - a_3)/\| a_1 - a_3 \| in the direction of the bar going from a_1 to a_3. Suppose the five bars all have the same stiffness, and so (taking c_1 = \cdots = c_5 = 1) the reduced stiffness matrix for the structure is

K^\star = (A^\star)^T A^\star = \begin{pmatrix}
 13/11 & 0 & 6/11 & -1 & 0 & 0 \\
 0 & 2/11 & 0 & 0 & 0 & 0 \\
 6/11 & 0 & 18/11 & 0 & 0 & 0 \\
 -1 & 0 & 0 & 13/11 & 0 & -6/11 \\
 0 & 0 & 0 & 0 & 2/11 & 0 \\
 0 & 0 & 0 & -6/11 & 0 & 18/11
\end{pmatrix}.
We find ker K ? = ker A? is one-dimensional, with basis

z^\star = ( 3, 0, -1, 3, 0, 1 )^T,
which indicates a mechanism that causes the swing set to collapse: the first node moves down and to the right, while the second node moves up and to the right, the horizontal motion being three times as large as the vertical. The structure can support forces f_1 = (f_1, g_1, h_1), f_2 = (f_2, g_2, h_2), provided
3 (f_1 + f_2) - h_1 + h_2 = 0,
requiring that the force vector f ? be orthogonal to the mechanism vector z? . Thus, as long
as the net horizontal force is in the y direction and the vertical forces on the two joints
are equal, the structure will maintain its shape. Otherwise, a reinforcing bar, say from a 1
to a6 (although this will interfere with the swinging!) or a pair of vertical bars from the
nodes to two new ground supports, will be required to completely stabilize the swing.
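As a hedged numerical aside (assuming numpy; not part of the original discussion), the mechanism and the equilibrium condition can be recovered computationally: the kernel of the 5 x 6 reduced incidence matrix is spanned by its last right singular vector, and a force is supportable exactly when it is orthogonal to that vector.

    import numpy as np

    r11 = 1/np.sqrt(11)
    Astar = np.array([
        [ r11,  r11, 3*r11,   0.,    0.,    0.  ],   # bar to support (0,0,0)
        [ r11, -r11, 3*r11,   0.,    0.,    0.  ],   # bar to support (0,2,0)
        [ -1.,   0.,   0.,    1.,    0.,    0.  ],   # horizontal cross bar
        [  0.,   0.,   0.,  -r11,   r11, 3*r11 ],    # bar to support (5,0,0)
        [  0.,   0.,   0.,  -r11,  -r11, 3*r11 ]])   # bar to support (5,2,0)

    U, S, Vt = np.linalg.svd(Astar)
    print(np.round(S, 4))               # five nonzero singular values: rank 5, so dim ker A* = 1
    z = Vt[-1]                          # spans the kernel (the mechanism)
    print(np.round(3*z/z[0], 4))        # proportional to ( 3, 0, -1, 3, 0, 1 )

    f = np.array([1., 0., 2., -1., 0., 2.])   # satisfies 3(f1+f2) - h1 + h2 = 0
    print(np.isclose(f @ z, 0))               # True: this force maintains equilibrium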
For a uniform downwards unit vertical force, f^\star = ( 0, 0, -1, 0, 0, -1 )^T, a particular solution to the reduced equilibrium equations is
u^\star = \bigl( \tfrac{13}{6}, \ 0, \ -\tfrac{4}{3}, \ \tfrac{11}{6}, \ 0, \ 0 \bigr)^T,

and the general solution u = u? + t z? is obtained by adding in an arbitrary element of the


kernel. The resulting forces/elongations are uniquely determined,
y = e = A^\star u = A^\star u^\star = \bigl( -\tfrac{\sqrt{11}}{6}, \ -\tfrac{\sqrt{11}}{6}, \ -\tfrac{1}{3}, \ -\tfrac{\sqrt{11}}{6}, \ -\tfrac{\sqrt{11}}{6} \bigr)^T,
so that every bar is compressed, the middle one experiencing slightly more than half the
stress of the outer supports.
If we stabilize the structure by adding in two vertical supports at the nodes, then the
new reduced incidence matrix
A^\star = \begin{pmatrix}
 1/\sqrt{11} & 1/\sqrt{11} & 3/\sqrt{11} & 0 & 0 & 0 \\
 1/\sqrt{11} & -1/\sqrt{11} & 3/\sqrt{11} & 0 & 0 & 0 \\
 -1 & 0 & 0 & 1 & 0 & 0 \\
 0 & 0 & 0 & -1/\sqrt{11} & 1/\sqrt{11} & 3/\sqrt{11} \\
 0 & 0 & 0 & -1/\sqrt{11} & -1/\sqrt{11} & 3/\sqrt{11} \\
 0 & 0 & 1 & 0 & 0 & 0 \\
 0 & 0 & 0 & 0 & 0 & 1
\end{pmatrix}
has trivial kernel, indicating stabilization of the structure. The reduced stiffness matrix
K^\star = (A^\star)^T A^\star = \begin{pmatrix}
 13/11 & 0 & 6/11 & -1 & 0 & 0 \\
 0 & 2/11 & 0 & 0 & 0 & 0 \\
 6/11 & 0 & 29/11 & 0 & 0 & 0 \\
 -1 & 0 & 0 & 13/11 & 0 & -6/11 \\
 0 & 0 & 0 & 0 & 2/11 & 0 \\
 0 & 0 & 0 & -6/11 & 0 & 29/11
\end{pmatrix}
is only slightly different than before, but this is enough to make it positive definite, K* > 0, and so allow arbitrary external forcing without collapse. Under the uniform vertical force,
the internal forces are

y = e = A^\star u = \bigl( -\tfrac{\sqrt{11}}{10}, \ -\tfrac{\sqrt{11}}{10}, \ -\tfrac{1}{5}, \ -\tfrac{\sqrt{11}}{10}, \ -\tfrac{\sqrt{11}}{10}, \ -\tfrac{2}{5}, \ -\tfrac{2}{5} \bigr)^T.

Note the overall reductions in stress in the original bars; the two new vertical bars are now
experiencing the largest amount of stress.

Chapter 7
Linear Functions and Linear Systems
We began this book by learning how to systematically solve systems of linear algebraic
equations. This elementary problem formed our launching pad for developing the fundamentals of linear algebra. In its initial form, matrices and vectors were the primary focus
of our study, but the theory was developed in a sufficiently general and abstract form that
it can be immediately applied to many other important situations particularly infinitedimensional function spaces. Indeed, applied mathematics deals, not just with algebraic
equations, but also differential equations, difference equations, integral equations, integrodifferential equations, differential delay equations, control systems, and many, many other
types of systems not all of which, unfortunately, can be adequately developed in this
introductory text. It is now time to assemble what we have learned about linear matrix
systems and place the results in a suitably general framework that will give us important insight, as well as certain fundamental principles that can be applied when solving
completely general linear problems.
The most basic underlying object of linear systems theory is the vector space, and
we have already seen that the elements of vector spaces can be vectors, or functions, or
even vector-valued functions. The seminal ideas of span, linear independence, basis and
dimension are equally applicable and equally vital in more general contexts, particularly
function spaces. Just as vectors in Euclidean space are prototypes of general elements of
vector spaces, matrices are also prototypes of much more general objects, known as linear
functions. Linear functions are also known as linear maps or linear operators, particularly
when we deal with function spaces, and include linear differential operators, linear integral
operators, evaluation of a function or its derivative at a point, and many other important
operations on functions. Generalized functions, such as the delta function to be introduced
in Chapter 10, are, in fact, properly formulated as linear operators on a suitable space of
functions. As such, linear maps form the simplest class of functions on vector spaces.
Nonlinear functions can often be closely approximated by linear functions, generalizing
the calculus approximation of a function by its tangent line. As a consequence, linear
functions must be thoroughly understood before any serious progress can be made in the
vastly more complicated nonlinear regime.
A linear system is just an equation satisfied by a linear function. The most basic
linear system is a system of linear algebraic equations, which we write in matrix notation
A x = b. Besides matrix systems, linear systems include linear differential equations, linear
boundary value problems, linear partial differential equations, and many, many others in
a common conceptual framework. The fundamental ideas of linear superposition, and the
relation between the solutions to inhomogeneous and homogeneous systems, underly the
structure of the solution space of all linear systems. You have no doubt encountered many
of these ideas in your first course on ordinary differential equations; they have also already
appeared in our development of the theory underlying the solution of linear algebraic
systems.
In geometry, linear functions are interpreted as linear transformations of space (or
space-time), and, as such, lie at the foundations of computer graphics and the mathematical formulation of symmetry. Basic geometrical transformations, including rotations,
scaling, reflections, shears and so on, are governed by linear transformations. Linear operators on infinite-dimensional function spaces are the basic objects of quantum mechanics.
Each quantum mechanical observable (mass, energy, momentum) is formulated as a linear
operator on an infinite-dimensional Hilbert space the space of wave functions or states
of the system. The dynamics of the quantum mechanical system is governed by the linear
Schrödinger equation. It is remarkable that quantum mechanics is an entirely linear theory, whereas classical and relativistic mechanics are inherently nonlinear! The holy grail of modern physics, the unification of general relativity and quantum mechanics, is to resolve the apparent incompatibility of the microscopic and macroscopic physical regimes.

7.1. Linear Functions.


We begin our study of linear functions with the basic definition. For simplicity, we shall
concentrate on real linear functions between real vector spaces. Extending the concepts
and constructions to complex linear functions on complex vector spaces is not difficult.
Definition 7.1. Let V and W be real vector spaces. A function L: V W is called
linear if it obeys two basic rules:
L[ v + w ] = L[ v ] + L[ w ], \qquad L[ c v ] = c L[ v ], \qquad \text{for all } v, w \in V, \ c \in \mathbb{R}. \qquad (7.1)

We will call V the domain space and W the target space for L.
In particular, setting c = 0 in the second condition implies that a linear function
always maps the zero element in V to the zero element in W , so
L[ 0 ] = 0.

(7.2)

We can readily combine the two defining conditions into a single rule
L[ c v + d w ] = c L[ v ] + d L[ w ] \qquad \text{for all } v, w \in V, \ c, d \in \mathbb{R}, \qquad (7.3)

that characterizes linearity of a function L. An easy induction proves that a linear function
respects linear combinations, so
L[ c_1 v_1 + \cdots + c_k v_k ] = c_1 L[ v_1 ] + \cdots + c_k L[ v_k ] \qquad (7.4)
for any c_1, \ldots, c_k \in \mathbb{R} and v_1, \ldots, v_k \in V.

The term target is used here to avoid later confusion with the range of L.

The interchangeable terms linear map, linear operator and, when V = W , linear
transformation are all commonly used as alternatives to linear function, depending on
the circumstances and taste of the author. The term linear operator is particularly
useful when the underlying vector space is a function space, so as to avoid confusing the
two different uses of the word function. As usual, we will sometimes refer to the elements
of a vector space as vectors even though they might be functions or matrices or something
else, depending upon the particular space.
Example 7.2. The simplest linear function is the zero function L[ v ] 0 which
maps every element v V to the zero vector in W . Note that, in view of (7.2), this is
the only constant linear function. A nonzero constant function is not, despite its evident
simplicity, linear. Another simple but important linear function is the identity function
I = I V : V V which leaves every vector unchanged: I [ v ] = v. Slightly more generally,
the operation of scalar multiplication Ma [ v ] = a v by a fixed scalar a R defines a linear
function from V to itself.
Example 7.3. Suppose V = R. We claim that every linear function L: R R has
the form
y = L(x) = a x,
for some constant a. Indeed, writing x R as a scalar product x = x 1, and using the
second property in (7.1), we find
L(x) = L(x 1) = x L(1) = a x,

where

a = L(1).

Therefore, the only scalar linear functions are those whose graph is a straight line passing
through the origin.
Warning: Even though the graph of the function
y = a x + b,

(7.5)

is a straight line, this is not a linear function unless b = 0 so the line goes through
the origin. The correct name for a function of the form (7.5) is an affine function; see
Definition 7.60 below.
Example 7.4. Let V = R n and W = R m . Let A be an m n matrix. Then the
function L[ v ] = A v given by matrix multiplication is easily seen to be a linear function.
Indeed, the requirements (7.1) reduce to the basic distributivity and scalar multiplication
properties of matrix multiplication:
A(v + w) = A v + A w,

A(c v) = c A v,

for all

v, w R n ,

c R.

In fact, every linear function between two Euclidean spaces has this form.
Theorem 7.5. Every linear function L: R n R m is given by matrix multiplication,
L[ v ] = A v, where A is an m n matrix.
Warning: Pay attention to the order of m and n. While A has size m n, the linear
function L goes from R n to R m .
Figure 7.1. Linear Function on Euclidean Space.

Proof : The key idea is to look at what the linear function does to the basis vectors.
Let e1 , . . . , en be the standard basis of R n , and let b
e1 , . . . , b
em be the standard basis of R m .
(We temporarily place hats on the latter to avoid confusing the two.) Since L[ e j ] R m ,
we can write it as a linear combination of the latter basis vectors:

L[ e_j ] = a_j = \begin{pmatrix} a_{1j} \\ a_{2j} \\ \vdots \\ a_{mj} \end{pmatrix} = a_{1j} \widehat{e}_1 + a_{2j} \widehat{e}_2 + \cdots + a_{mj} \widehat{e}_m, \qquad j = 1, \ldots, n. \qquad (7.6)
Let us construct the m × n matrix
A = ( a_1 \ a_2 \ \ldots \ a_n ) = \begin{pmatrix} a_{11} & a_{12} & \cdots & a_{1n} \\ a_{21} & a_{22} & \cdots & a_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ a_{m1} & a_{m2} & \cdots & a_{mn} \end{pmatrix} \qquad (7.7)

whose columns are the image vectors (7.6). Using (7.4), we then compute the effect of L
on a general vector v = ( v_1, v_2, \ldots, v_n )^T ∈ R^n:
L[ v ] = L[ v_1 e_1 + \cdots + v_n e_n ] = v_1 L[ e_1 ] + \cdots + v_n L[ e_n ] = v_1 a_1 + \cdots + v_n a_n = A v.
The final equality follows from our basic formula (2.15) connecting matrix multiplication
and linear combinations. We conclude that the vector L[ v ] coincides with the vector A v
obtained by multiplying v by the coefficient matrix A.
Q.E.D.
The proof of Theorem 7.5 shows us how to construct the matrix representative of
a given linear function L: R n R m . We merely assemble the image column vectors
a1 = L[ e1 ], . . . , an = L[ en ] into an m n matrix A.
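To make the recipe concrete, here is a minimal Python sketch (assuming numpy; the helper name matrix_of and the sample map are hypothetical illustrations, not from the text) that assembles the matrix of a linear map from its values on the standard basis vectors.

    import numpy as np

    def matrix_of(L, n):
        # the j-th column is L applied to the j-th standard basis vector of R^n
        return np.column_stack([L(np.eye(n)[:, j]) for j in range(n)])

    # a hypothetical linear map L: R^3 -> R^2, given only as a function
    L = lambda v: np.array([2*v[0] - v[2], v[1] + 4*v[2]])

    A = matrix_of(L, 3)                # [[ 2.  0. -1.]
                                       #  [ 0.  1.  4.]]
    v = np.array([1., 2., 3.])
    print(np.allclose(L(v), A @ v))    # True: L[v] = A v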
Example 7.6. In the case of a function from R n to itself, the two basic linearity
conditions (7.1) have a simple geometrical interpretation. Since vector addition is the
same as completing the parallelogram indicated in Figure 7.1, the first linearity condition
requires that L map parallelograms to parallelograms. The second linearity condition says
that if we stretch a vector by a factor c, then its image under L must also be stretched by
Figure 7.2. Linearity of Rotations.
the same amount. Thus, one can often detect linearity by simply looking at the geometry
of the function.
As a specific example, consider the function R_θ: R² → R² that rotates the vectors in the plane around the origin by a specified angle θ. This geometric transformation clearly preserves parallelograms, as well as stretching (see Figure 7.2), and hence defines a
linear function. In order to find its matrix representative, we need to find out where the
basis vectors e1 , e2 are mapped. Referring to Figure 7.3, we have

R_\theta[ e_1 ] = \cos\theta \, e_1 + \sin\theta \, e_2 = \begin{pmatrix} \cos\theta \\ \sin\theta \end{pmatrix}, \qquad R_\theta[ e_2 ] = -\sin\theta \, e_1 + \cos\theta \, e_2 = \begin{pmatrix} -\sin\theta \\ \cos\theta \end{pmatrix}.
According to the general recipe (7.7), we assemble these two column vectors to obtain the
matrix form of the rotation transformation, and so

R_\theta[ v ] = A_\theta v, \qquad\text{where}\qquad A_\theta = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}. \qquad (7.8)

Therefore, rotating a vector v = ( x, y )^T through angle \theta gives the vector
\widehat{v} = R_\theta[ v ] = A_\theta v = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix} \begin{pmatrix} x \\ y \end{pmatrix} = \begin{pmatrix} x\cos\theta - y\sin\theta \\ x\sin\theta + y\cos\theta \end{pmatrix},
with coordinates
\widehat{x} = x\cos\theta - y\sin\theta, \qquad \widehat{y} = x\sin\theta + y\cos\theta.

These formulae can be proved directly, but, in fact, are a consequence of the underlying
linearity of rotations.
Figure 7.3. Rotation in R 2.

Linear Operators
So far, we have concentrated on linear functions on Euclidean space, and seen that
they are all represented by matrices. For function spaces, there is a much wider variety of
linear operators available, and a complete classification is out of the question. Let us look
at some of the main representative examples that arise in applications. Recall that C 0 [ a, b ]
denotes the vector space consisting of all continuous functions on the interval [ a, b ].
Example 7.7. (i ) Evaluation of the function at a point, L[ f ] = f (x0 ), defines a
linear operator L: C0 [ a, b ] R, because
L[ c f + d g ] = c f (x0 ) + d g(x0 ) = c L[ f ] + d L[ g ]
for any functions f, g C0 [ a, b ] and scalars (constants) c, d.
(ii) Another real-valued linear function is the integration operator
I[ f ] = \int_a^b f(x) \, dx. \qquad (7.9)

Linearity of I is an immediate consequence of the basic integration identity


\int_a^b \bigl[\, c f(x) + d g(x) \,\bigr] dx = c \int_a^b f(x)\, dx + d \int_a^b g(x)\, dx,
which is valid for arbitrary integrable (in particular, continuous) functions f, g and scalars c, d.
(iii ) We have already seen that multiplication of functions by a fixed scalar f (x) 7
c f (x) defines a linear map Mc : C0 [ a, b ] C0 [ a, b ]; the particular case c = 1 reduces to the
identity transformation I = M1 . More generally, if a(x) C0 [ a, b ] is a fixed continuous
function, then the operation Ma [ f (x) ] = a(x) f (x) of multiplication by a also defines a
linear transformation Ma : C0 [ a, b ] C0 [ a, b ].
(iv) Another important linear transformation is the indefinite integral
J[ f ] = \int_a^x f(y) \, dy. \qquad (7.10)

According to the Fundamental Theorem of Calculus, the integral of a continuous function is


continuously differentiable; therefore, J: C0 [ a, b ] C1 [ a, b ] defines a linear operator from
the space of continuous functions to the space of continuously differentiable functions.
(v) Vice versa, differentiation of functions is also a linear operation. To be precise,


since not every continuous function can be differentiated, we take the domain space to
be the vector space of continuously differentiable functions on an interval. The derivative
operator
D[ f ] = f' \qquad (7.11)
defines a linear operator D: C¹[ a, b ] → C⁰[ a, b ]. This follows from the elementary differentiation formula
D[ c f + d g ] = (c f + d g)' = c f' + d g' = c D[ f ] + d D[ g ],
valid whenever c, d are constant.
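As a brief computational aside (assuming numpy; the sample functions are arbitrary choices, not from the text), one can spot-check the linearity of these operators numerically, approximating the definite integral by a simple trapezoidal sum.

    import numpy as np

    xs = np.linspace(0.0, 1.0, 1001)             # sample points on [a, b] = [0, 1]

    evaluate = lambda f: f(0.3)                                   # L[f] = f(x0)
    def integrate(f):                                             # I[f] ~ trapezoidal sum
        ys = f(xs)
        return float(np.sum(0.5*(ys[1:] + ys[:-1]) * np.diff(xs)))
    multiply = lambda f: (lambda x: np.sin(x) * f(x))             # (M_a f)(x) = a(x) f(x)

    f = lambda x: np.cos(x)
    g = lambda x: x**2
    c, d = 2.0, -3.0
    combo = lambda x: c*f(x) + d*g(x)

    print(np.isclose(evaluate(combo), c*evaluate(f) + d*evaluate(g)))               # True
    print(np.isclose(integrate(combo), c*integrate(f) + d*integrate(g)))            # True
    print(np.allclose(multiply(combo)(xs), c*multiply(f)(xs) + d*multiply(g)(xs)))  # True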
The Space of Linear Functions
Given vector spaces V, W , we use L( V, W ) to denote the set of all linear functions
L: V W . We claim that L( V, W ) is itself a vector space. We add two linear functions
L, M L( V, W ) in the same way we add general functions: (L + M )[ v ] = L[ v ] + M [ v ].
The reader should check that L + M satisfies the linear function axioms (7.1) provided L
and M do. Similarly, multiplication of a linear function by a scalar c R is defined so
that (c L)[ v ] = c L[ v ], again producing a linear function. The verification of the vector
space axioms is left to the reader.
In particular, if V = R n and W = R m , then Theorem 7.5 implies that we can identify
n
L( R , R m ) with the space Mmn of all m n matrices. Addition of linear functions
corresponds to matrix addition, while scalar multiplication coincides with the usual scalar
multiplication of matrices. Therefore, the space of all m n matrices forms a vector
space a fact we already knew. A basis for Mmn is given by the m n matrices Eij ,
1 i m, 1 j n, which have a single 1 in the (i, j) position and zeros everywhere
else. Therefore, the dimension of Mmn is m n. Note that Eij corresponds to the specific
linear transformation mapping ej R n to b
ei R m and every other ek R n to zero.
Example 7.8. The space of linear transformations of the plane, L( R², R² ), is identified with the space M22 of 2 × 2 matrices A = \begin{pmatrix} a & b \\ c & d \end{pmatrix}. The standard basis of M22 consists of the 4 = 2 · 2 matrices
E_{11} = \begin{pmatrix} 1 & 0 \\ 0 & 0 \end{pmatrix}, \quad E_{12} = \begin{pmatrix} 0 & 1 \\ 0 & 0 \end{pmatrix}, \quad E_{21} = \begin{pmatrix} 0 & 0 \\ 1 & 0 \end{pmatrix}, \quad E_{22} = \begin{pmatrix} 0 & 0 \\ 0 & 1 \end{pmatrix}.
Indeed, we can uniquely write any other matrix
A = \begin{pmatrix} a & b \\ c & d \end{pmatrix} = a E_{11} + b E_{12} + c E_{21} + d E_{22}
as a linear combination of these four basis matrices.

In infinite-dimensional situations, one usually imposes additional restrictions, e.g., continuity


or boundedness of the linear operators. We can safely relegate these more subtle distinctions to a
more advanced treatment of the subject. See [ 102 ] for a full discussion of the rather sophisticated
analytical details, which do play an important role in serious quantum mechanical applications.

A particularly important case is when the target space of the linear functions is R.
Definition 7.9. The dual space to a vector space V is defined as the space V =
L( V, R ) consisting of all real-valued linear functions L: V R.
If V = R^n, then every linear function L: R^n → R is given by multiplication by a 1 × n matrix, i.e., a row vector. Explicitly,
L[ v ] = a v = a_1 v_1 + \cdots + a_n v_n, \qquad\text{where}\qquad a = ( a_1 \ a_2 \ \ldots \ a_n ), \quad v = \begin{pmatrix} v_1 \\ v_2 \\ \vdots \\ v_n \end{pmatrix}.

Therefore, we can identify the dual space (R n ) with the space of row vectors with n
entries. In light of this fact, the distinction between row vectors and column vectors is now
seen to be much more sophisticated than mere semantics or notation. Row vectors should
be viewed as real-valued linear functions the dual objects to column vectors.
The standard dual basis ε_1, . . . , ε_n of (R^n)^∗ consists of the standard row basis vectors, namely ε_j is the row vector with 1 in the j-th slot and zeros elsewhere. The j-th dual basis element defines the linear function
E_j[ v ] = ε_j v = v_j,
that picks off the j-th coordinate of v with respect to the original basis e_1, . . . , e_n. Thus,
the dimension of V = R n and its dual (R n ) are both equal to n. The dual basis functions
are uniquely defined by the requirements

E_i( e_j ) = \delta_{ij} = \begin{cases} 1, & i = j, \\ 0, & i \neq j. \end{cases} \qquad (7.12)
The symbol \delta_{ij} defined in (7.12) is known as the Kronecker delta. Further developments
can be found in the exercises.
An inner product structure provides a mechanism for identifying a vector space and
its dual. However, it should be borne in mind that this identification will depend upon
the choice of inner product.
Theorem 7.10. Let V be a finite-dimensional real inner product space. Then every
linear function L: V R is given by an inner product
L[ v ] = h a ; v i

(7.13)

with a unique vector a V . The correspondence between L and a allows us to identify


V 'V.
Proof : Let u1 , . . . , un be an orthonormal basis of V . (If necessary, we can use the
GramSchmidt process to generate such a basis.) If we write v = x1 u1 + + xn un , then,
by linearity,
L[ v ] = x1 L[ u1 ] + + xn L[ un ] = a1 x1 + + an xn ,
where ai = L[ ui ]. On the other hand, if we write a = a1 u1 + + an un , then, by


orthonormality of the basis,
⟨ a ; v ⟩ = \sum_{i,j=1}^{n} a_i x_j ⟨ u_i ; u_j ⟩ = a_1 x_1 + \cdots + a_n x_n.

Thus equation (7.13) holds, which completes the proof.

Q.E.D.

Remark : In the particular case when V = R n is endowed with the standard dot
product, then Theorem 7.10 identifies a row vector representing a linear function with
the corresponding column vector obtained by transposition a 7 aT . Thus, the nave
identification of a row and a column vector is, in fact, an indication of a much more subtle
phenomenon that relies on the identification of R n with its dual based on the Euclidean
inner product. Alternative inner products will lead to alternative, more complicated,
identifications of row and column vectors.
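A small numerical sketch (assuming numpy; the particular vectors and Gram matrix are hypothetical) illustrates the point: with respect to a weighted inner product ⟨v ; w⟩ = v^T C w, the vector representing the linear functional L[v] = a · v is C^{-1} a rather than a itself.

    import numpy as np

    a = np.array([1.0, 2.0, -1.0])                 # the functional L[v] = a . v
    C = np.diag([2.0, 3.0, 1.0])                   # positive definite Gram matrix: <v;w> = v^T C w
    b = np.linalg.solve(C, a)                      # representing vector, since b^T C v = a . v

    v = np.array([0.5, -1.0, 4.0])
    print(np.isclose(a @ v, b @ C @ v))            # True
    print(b)                                       # [0.5  0.6667  -1.], which differs from a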
Important: Theorem 7.10 is not true if V is infinite-dimensional. This fact will have
important repercussions for the analysis of the differential equations of continuum mechanics, which will lead us immediately into the much deeper waters of generalized function
theory. Details will be deferred until Section 10.2.
Composition of Linear Functions
Besides adding and multiplying by scalars, one can also compose linear functions.
Lemma 7.11. Let V, W, Z be vector spaces. If L: V W and M : W Z are linear
functions, then the composite function M L: V Z, defined by (M L)[ v ] = M [ L[ v ] ]
is linear.
Proof : This is straightforward:
(M L)[ c v + d w ] = M [ L[ c v + d w ] ] = M [ c L[ v ] + d L[ w ] ]
= c M [ L[ v ] ] + d M [ L[ w ] ] = c (M L)[ v ] + d (M L)[ w ],
where we used, successively, the linearity of L and then of M .

Q.E.D.

For example, if L[ v ] = A v maps R n to R m , and M [ w ] = B w maps R m to R l , so


that A is an m n matrix and B is a l m matrix, then the composition is
(M L)[ v ] = M [ L[ v ] ] = B(A v) = (B A) v,
and hence the composition M L: R n R l corresponds to the l n product matrix BA.
In other words, on Euclidean space, composition of linear functions is the same as matrix
multiplication!
As with matrix multiplication, composition of (linear) functions is not commutative.
In general the order of the constituents makes a difference.
Example 7.12. Composing two rotations gives another rotation: R_\theta ∘ R_\varphi = R_{\theta+\varphi}. In other words, if we first rotate by angle \varphi and then by angle \theta, the net result is rotation by angle \theta + \varphi. On the matrix level, in view of (7.8), this implies that
A_\theta \, A_\varphi = A_{\theta+\varphi},
or, explicitly,
\begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix} \begin{pmatrix} \cos\varphi & -\sin\varphi \\ \sin\varphi & \cos\varphi \end{pmatrix} = \begin{pmatrix} \cos(\theta+\varphi) & -\sin(\theta+\varphi) \\ \sin(\theta+\varphi) & \cos(\theta+\varphi) \end{pmatrix}.
Performing the matrix multiplication on the left hand side, we deduce the well-known trigonometric addition formulae
\cos(\theta + \varphi) = \cos\theta \cos\varphi - \sin\theta \sin\varphi, \qquad \sin(\theta + \varphi) = \cos\theta \sin\varphi + \sin\theta \cos\varphi.

In fact, this computation constitutes a bona fide proof of these two identities!
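A one-line numerical check (assuming numpy; purely illustrative) of the composition rule:

    import numpy as np

    def rot(t):
        c, s = np.cos(t), np.sin(t)
        return np.array([[c, -s], [s, c]])

    theta, phi = 0.7, 1.9
    print(np.allclose(rot(theta) @ rot(phi), rot(theta + phi)))   # True
    # the (0,0) entry of the product is cos(theta)cos(phi) - sin(theta)sin(phi) = cos(theta+phi)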
Example 7.13. One can build up more sophisticated linear operators on function
space by adding and composing simpler ones. In particular, the linear higher order derivative operators are obtained by composing the derivative operator D, defined in (7.11), with
itself. For example,
D^2[ f ] = D ∘ D[ f ] = D[ f' ] = f''
is the second derivative operator. One needs to exercise some care about the domain of definition, since not every function is differentiable. In general,
D^k[ f ] = f^{(k)}(x) \qquad\text{defines a linear operator}\qquad D^k : C^n[ a, b ] → C^{n-k}[ a, b ]
for any n ≥ k. If we compose D^k with the linear operation of multiplication by a fixed function a(x) ∈ C^{n-k}[ a, b ], we obtain the linear operator f(x) ↦ a(x) D^k(f(x)) = a(x) f^{(k)}(x).
Finally, a general linear ordinary differential operator of order n
L = a_n(x) D^n + a_{n-1}(x) D^{n-1} + \cdots + a_1(x) D + a_0(x) \qquad (7.14)

is obtained by adding such linear operators. If the coefficient functions a 0 (x), . . . , an (x)
are continuous, then
L[ u ] = an (x)

dn u
dn1 u
du
+
a
(x)
+ + a1 (x)
+ a0 (x)u.
n1
n
n1
dx
dx
dx

(7.15)

is continuous provided u Cn [ a, b ], and so L defines a linear operator from Cn [ a, b ] to


C0 [ a, b ]. The most important case but certainly not the only one arising in applications
is when the coefficients ai (x) = ci of L are all constant.
Inverses
The inverse of a linear function is defined in direct analogy with the Definition 1.13
of the inverse of a (square) matrix.
Definition 7.14. Let L: V → W and M: W → V be linear functions. If both composite functions
L ∘ M = I_W, \qquad M ∘ L = I_V, \qquad (7.16)
are equal to the identity function, then we call M the inverse of L and write M = L^{-1}.
The two conditions (7.16) require
L[ M[ w ] ] = w \ \text{ for all } \ w ∈ W, \qquad M[ L[ v ] ] = v \ \text{ for all } \ v ∈ V.

Of course, if M = L1 is the inverse of L, then so is L = M 1 the inverse of M since the


conditions are symmetric.
If V = R n , W = R m , so that L and M are given by matrix multiplication, by A and
B respectively, then the conditions (7.16) reduce to the usual conditions
AB = I,

BA = I,

for matrix inversion, cf. (1.39). Therefore B = A1 is the inverse matrix. In particular,
for L to have an inverse, we need m = n and its coefficient matrix A to be square and
nonsingular.
Example 7.15. The Fundamental Theorem of Calculus says, roughly, that differentiation
D[ f ] = f' \qquad\text{and indefinite integration}\qquad J[ f ] = \int_a^x f(y)\, dy

are inverse operations. More precisely, the derivative of the indefinite integral of f is
equal to f , and hence
D[ J[ f(x) ] ] = \frac{d}{dx} \int_a^x f(y)\, dy = f(x).
In other words, the composition
D ∘ J = I_{C^0[ a,b ]}
defines the identity operator on C0 [ a, b ]. On the other hand, if we integrate the derivative
of a continuously differentiable function, we obtain
J[ D[ f(x) ] ] = J[ f'(x) ] = \int_a^x f'(y)\, dy = f(x) - f(a).

Therefore
J[ D[ f(x) ] ] = f(x) - f(a), \qquad\text{and so}\qquad J ∘ D ≠ I_{C^1[ a,b ]}

is not the identity operator. Therefore, D is a left inverse for J, but not a right inverse!
This surprising phenomenon could not be anticipated from the finite-dimensional matrix theory. Indeed, if a matrix A has a left inverse B, then B is automatically a right
inverse too, and we write B = A1 as the inverse of A. On an infinite-dimensional vector
space, a linear operator may possess one inverse without necessarily possessing the other.
However, if both a left and a right inverse exist they must be equal; see Exercise .
If we restrict D to the subspace V = { f | f(a) = 0 } ⊂ C¹[ a, b ] consisting of all continuously differentiable functions that vanish at the left hand endpoint, then J: C⁰[ a, b ] → V and D: V → C⁰[ a, b ] are, by the preceding argument, inverse linear operators: D ∘ J = I_{C⁰[a,b]}, and J ∘ D = I_V. Note that V ⊊ C¹[ a, b ] ⊊ C⁰[ a, b ]. Thus, we have the curious and disconcerting infinite-dimensional phenomenon that J defines a one-to-one, invertible, linear map from a vector space C⁰[ a, b ] to a proper subspace V ⊊ C⁰[ a, b ]. This paradoxical
situation cannot occur in finite dimensions, since a linear map can only be invertible when
the domain and target spaces have the same dimension, and hence its matrix is necessarily
square!
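A symbolic spot-check (assuming the sympy package; the test function is an arbitrary choice) of the one-sided inverse relationship between D and J:

    import sympy as sp

    x, y, a = sp.symbols('x y a')
    f = sp.exp(2*y) + sp.sin(y)                        # test integrand, as a function of y

    J = sp.integrate(f, (y, a, x))                     # J[f](x), the integral from a to x
    print(sp.simplify(sp.diff(J, x) - f.subs(y, x)))   # 0 :  D[J[f]] = f

    g = sp.exp(2*x) + sp.sin(x)                        # the same function, viewed in x
    JD = sp.integrate(sp.diff(g, x).subs(x, y), (y, a, x))
    print(sp.simplify(JD - (g - g.subs(x, a))))        # 0 :  J[D[f]] = f - f(a), not f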

7.2. Linear Systems.


The abstract notion of a linear system unifies linear systems of algebraic equations,
linear ordinary differential equations, linear partial differential equations, linear boundary
value problems, and a wide variety of other linear problems, all in a common conceptual
framework. The idea is to replace matrix multiplication by a general linear function.
Many of the theoretical results we learned in the algebraic context have, when suitably
formulated, direct counterparts in these more general situations.
Definition 7.16. A linear system is an equation of the form
L[ u ] = f ,

(7.17)

where L: V W is a linear function between vector spaces. The right hand side f W
is an element of the target space, and the desired solution u V belongs to the domain
space. The system is called homogeneous if f = 0; otherwise, it is called inhomogeneous.
Example 7.17. If V = R n and W = R m , then, according to Theorem 7.5, every
linear function L: R n R m is given by matrix multiplication: L[ u ] = A u. Therefore, in
this particular case, every linear system is a matrix system, A u = f .
Example 7.18. A linear ordinary differential equation takes the form L[ u ] = f ,
where L is a linear differential operator of the form (7.14), and the right hand side is, say,
a continuous function. Written out, the differential equation takes the familiar form
L[ u ] = a_n(x) \frac{d^n u}{dx^n} + a_{n-1}(x) \frac{d^{n-1} u}{dx^{n-1}} + \cdots + a_1(x) \frac{du}{dx} + a_0(x)\, u = f(x). \qquad (7.18)
The student should already have some familiarity with solving the constant coefficient
case. Appendix C describes a method for solving more general, non-constant coefficient
equations based on their Taylor series expansion.

Example 7.19. Let K(x, y) be a function of two variables which is continuous for
all a ≤ x, y ≤ b. Then the integral
F_K[ u ] = \int_a^b K(x, y)\, u(y)\, dy

defines a linear operator F_K: C⁰[ a, b ] → C⁰[ a, b ]. A linear integral equation of the form
\int_a^b K(x, y)\, u(y)\, dy = f(x)
is known as a Fredholm integral equation; such equations were studied by Fredholm when he arrived at his Alternative Theorem 5.45. Another, slightly different, example is the linear integral operator
V_K[ u ] = \int_a^x K(x, y)\, u(y)\, dy,
with variable upper limit, which defines a linear Volterra integral equation
\int_a^x K(x, y)\, u(y)\, dy = f(x),
first studied, slightly earlier, by the Italian mathematician Vito Volterra.

Example 7.20. One can combine linear maps to form more complicated, mixed
types of linear systems. For example, consider a typical initial value problem
u'' + u' - 2u = x, \qquad u(0) = 1, \qquad u'(0) = 1, \qquad (7.19)

for a scalar unknown function u(x). The differential equation is given as a linear system
L[ u ] = x, \qquad\text{where}\qquad L[ u ] = (D^2 + D - 2)[ u ] = u'' + u' - 2u

is a linear, constant coefficient differential operator. If we further define

M[ u ] = \begin{pmatrix} L[ u ] \\ u(0) \\ u'(0) \end{pmatrix} = \begin{pmatrix} u'' + u' - 2u \\ u(0) \\ u'(0) \end{pmatrix},

then M defines a linear map whose domain is the space C² of twice continuously differentiable functions, and whose range is the vector space V consisting of all triples
v = \begin{pmatrix} f(x) \\ a \\ b \end{pmatrix},
where f ∈ C⁰ is a continuous function and a, b ∈ R are real constants. We leave it to the reader to convince themselves that this is indeed a vector space under the evident addition and scalar multiplication operations. In this way, we can write the initial value problem (7.19) in linear systems form as M[ u ] = ( x, 1, 1 )^T.
A similar construction applies to linear boundary value problems. For example, the
boundary value problem
u'' + u = e^x, \qquad u(0) = 1, \qquad u(1) = 2,

This is a particular case of the general Cartesian product construction between vector spaces,
with V = C⁰ × R × R. See Exercise for details.

is in the form of a linear system


M[ u ] = f, \qquad\text{where}\qquad M[ u ] = \begin{pmatrix} u'' + u \\ u(0) \\ u(1) \end{pmatrix}, \qquad f = \begin{pmatrix} e^x \\ 1 \\ 2 \end{pmatrix}.

Note that M : C2 V defines a linear map having the same domain and target space as
above.
The Superposition Principle
Before attempting to tackle inhomogeneous linear systems, it will help to look first
at the homogeneous case. The most important fact is that homogeneous linear systems
admit a superposition principle, that allows one to construct new solutions from known
solutions. The word superposition refers to taking linear combinations of solutions.
Consider a general homogeneous linear system
L[ z ] = 0

(7.20)

where L is a linear function. If we are given two solutions, say z 1 and z2 , so that
L[ z1 ] = 0,

L[ z2 ] = 0,

then their sum z1 + z2 is automatically a solution, since, using the linearity of L,


L[ z1 + z2 ] = L[ z1 ] + L[ z2 ] = 0 + 0 = 0.
Similarly, given a solution z and any scalar c, the scalar multiple c z is automatically a
solution, since
L[ c z ] = c L[ z ] = c 0 = 0.
Combining these two elementary observations, we can now state the general superposition
principle. The proof follows immediately from formula (7.4).
Theorem 7.21. If z1 , . . . , zk are all solutions to the homogeneous linear system
L[ z ] = 0, and c1 , . . . , ck are any scalars, then the linear combination c1 z1 + + ck zk is
also a solution.
As with matrices, we call the solution space to the homogeneous linear system (7.20)
the kernel of the linear function L. The superposition Theorem 7.21 implies that the kernel
always forms a subspace.
Theorem 7.22. If L: V W is a linear function, then its kernel
ker L = { z ∈ V | L[ z ] = 0 } ⊂ V \qquad (7.21)

forms a subspace of the domain space V .


As we know, in the case of linear matrix systems, the kernel can be explicitly determined by the basic Gaussian elimination algorithm. For more general linear operators, one
must develop appropriate solution techniques for solving the homogeneous linear system.
Here is a simple example from the theory of linear, constant coefficient ordinary differential
equations.
Example 7.23. Consider the second order linear differential operator
L = D^2 - 2D - 3, \qquad (7.22)
which maps the function u(x) to the function
L[ u ] = (D^2 - 2D - 3)[ u ] = u'' - 2u' - 3u.
The homogeneous system takes the form of a homogeneous, linear, second order ordinary differential equation
L[ u ] = u'' - 2u' - 3u = 0.
(7.23)
In accordance with the standard solution method, we plug the exponential ansatz
u = e^{\lambda x}
into the equation. The result is
L[ e^{\lambda x} ] = D^2[ e^{\lambda x} ] - 2 D[ e^{\lambda x} ] - 3 e^{\lambda x} = (\lambda^2 - 2\lambda - 3)\, e^{\lambda x},
and therefore, e^{\lambda x} is a solution if and only if \lambda satisfies the characteristic equation
0 = \lambda^2 - 2\lambda - 3 = (\lambda - 3)(\lambda + 1).
The two roots are \lambda_1 = 3, \lambda_2 = -1, and hence
u_1(x) = e^{3x}, \qquad u_2(x) = e^{-x},

(7.24)

are two linearly independent solutions of (7.23). According to the general superposition
principle, every linear combination
u(x) = c_1 u_1(x) + c_2 u_2(x) = c_1 e^{3x} + c_2 e^{-x}
of these two basic solutions is also a solution, for any choice of constants c_1, c_2. In fact, this two-parameter family of solutions constitutes the most general solution to the ordinary
differential equation (7.23). Thus, the kernel of the second order differential operator (7.22)
is two-dimensional, with basis given by the independent exponential solutions (7.24).
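A short symbolic check (assuming sympy) of the characteristic roots and the resulting kernel:

    import sympy as sp

    x, lam, c1, c2 = sp.symbols('x lambda c1 c2')
    print(sp.solve(lam**2 - 2*lam - 3, lam))               # [-1, 3]

    u = c1*sp.exp(3*x) + c2*sp.exp(-x)                     # general element of ker L
    print(sp.simplify(u.diff(x, 2) - 2*u.diff(x) - 3*u))   # 0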

The German word ansatz (plural ansätze) refers to the method of finding a solution to a complicated equation by guessing the solution's form in advance. Usually, one cannot guess the solution completely, and so the ansatz will have one or more free parameters (in this case the constant exponent λ) that, with some luck, can be rigged up to fulfill the requirements imposed by the equation. Thus, a reasonable English translation of ansatz is "inspired guess".

In general, the solution space to an nth order homogeneous linear ordinary differential
equation
L[ u ] = a_n(x) \frac{d^n u}{dx^n} + a_{n-1}(x) \frac{d^{n-1} u}{dx^{n-1}} + \cdots + a_1(x) \frac{du}{dx} + a_0(x)\, u = 0 \qquad (7.25)

forms a subspace of the vector space Cn [ a, b ] of n times continuously differentiable functions, since it is just the kernel of a linear differential operator L: C n [ a, b ] C0 [ a, b ].
The fact that the solution space (kernel) is a subspace implies that linear combinations of
solutions are also solutions.
The differential operator L is called nonsingular on an open interval [ a, b ] if its coefficients a_n(x), . . . , a_0(x) ∈ C⁰[ a, b ] are continuous functions and its leading coefficient does not vanish: a_n(x) ≠ 0 for all a < x < b. The basic existence result governing
nonsingular homogeneous linear ordinary differential equations can be formulated as a
characterization of the dimension of the solution space.
Theorem 7.24. The kernel of a nonsingular nth order ordinary differential operator
forms an n-dimensional subspace ker L Cn [ a, b ].
The proof relies on the fundamental existence and uniqueness theorems for ordinary
differential equations, and is discussed in Section 19.1. The fact that the kernel has dimension n implies that there exists a basis consisting of n linearly independent solutions
u1 (x), . . . , un (x) Cn [ a, b ] such that the general solution to equation (7.25) is given by a
linear combination
u(x) = c1 u1 (x) + + cn un (x),
where c1 , . . . , cn are arbitrary constants. Therefore, once we find n linearly independent
solutions of an nth order homogeneous linear ordinary differential equation, we can immediately construct the most general solution.
Remark : The condition that the leading coefficient an (x) does not vanish is essential.
Points where an (x) = 0 are known as singular points. They arise in many applications,
but must be treated separately and with care; see Appendix C. Of course, if the coefficients
are constant then there is nothing to worry about either the leading coefficient a n 6= 0,
or the operator is, in fact, of lower order.
Example 7.25. A second order Euler differential equation takes the form
L[ u ] = a x^2 u'' + b x u' + c u = 0, \qquad (7.26)
where a ≠ 0, b, c are constants. Instead of the exponential solution ansatz used in the constant coefficient case, Euler equations are solved by using a power ansatz
u(x) = x^r
with unknown exponent r. Substituting into the differential equation, we find
L[ x^r ] = a r(r-1) x^r + b r x^r + c x^r = [\, a r(r-1) + b r + c \,]\, x^r = 0,
and hence x^r is a solution if and only if r satisfies the characteristic equation
a r(r-1) + b r + c = a r^2 + (b - a) r + c = 0. \qquad (7.27)
If the characteristic equation has two real roots, r_1, r_2, then there are two linearly independent solutions u_1(x) = x^{r_1} and u_2(x) = x^{r_2}, and the general solution to (7.26) has the form
u(x) = c_1 | x |^{r_1} + c_2 | x |^{r_2}. \qquad (7.28)
(The absolute values are usually needed to ensure that the solutions remain real when
x < 0 is negative.)
The Euler equation has a singular point at x = 0, where its leading coefficient vanishes.
Theorem 7.24 assures us that the differential equation has a two-dimensional solution space
on any interval not containing the singular point. However, the number of solutions which
remain continuously differentiable at x = 0 depends on the precise values of r 1 and r2 .
The case
x^2 u'' - 3 x u' + 3 u = 0 \qquad\text{has solution}\qquad u = c_1 x + c_2 x^3,
which forms a two-dimensional subspace of C⁰(R). However,
x^2 u'' + x u' - u = 0 \qquad\text{has solution}\qquad u = c_1 x + \frac{c_2}{x},
and only the multiples of the first solution x are continuous at x = 0. Therefore, the solutions that are continuous everywhere form only a one-dimensional subspace of C⁰(R). Finally,
x^2 u'' + 5 x u' + 3 u = 0 \qquad\text{has solution}\qquad u = \frac{c_1}{x} + \frac{c_2}{x^3},
and there are no nontrivial solutions u ≢ 0 that are continuous at x = 0.
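For the record, a tiny sympy sketch (illustrative only) recovers the exponents r in each of the three cases from the characteristic equation a r(r-1) + b r + c = 0:

    import sympy as sp

    r = sp.symbols('r')
    for a, b, c in [(1, -3, 3), (1, 1, -1), (1, 5, 3)]:
        print((a, b, c), sp.solve(a*r*(r - 1) + b*r + c, r))
    # (1, -3, 3) [1, 3]      u = c1 x   + c2 x^3
    # (1, 1, -1) [-1, 1]     u = c1 x   + c2 / x
    # (1, 5, 3)  [-3, -1]    u = c1 / x + c2 / x^3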

Example 7.26. Consider the Laplace equation


\Delta[ u ] = \frac{\partial^2 u}{\partial x^2} + \frac{\partial^2 u}{\partial y^2} = 0 \qquad (7.29)

for analytic functions u(x, y) ∈ A(Ω) defined on a domain Ω ⊂ R². The Laplace equation forms a homogeneous linear partial differential equation corresponding to the partial differential operator Δ = ∂²_x + ∂²_y, known as the Laplacian operator. Linearity can either be proved directly, or by noting that Δ is built up from the basic linear partial derivative operators ∂_x, ∂_y by the processes of composition and addition; see Exercise .
Unlike the case of a linear ordinary differential equation, there are an infinite number
of linearly independent solutions to the Laplace equation. Examples include the trigonometric/exponential solutions
e^{kx} \cos ky, \qquad e^{kx} \sin ky, \qquad e^{ky} \cos kx, \qquad e^{ky} \sin kx,

This seems an overly restrictive assumption, but in fact it can be proved that every solution
to the Laplace equation is an analytic function.

where k is any real constant. There are also infinitely many independent polynomial
solutions, the first few of which are
1, \quad x, \quad y, \quad x^2 - y^2, \quad x y, \quad x^3 - 3 x y^2, \quad \ldots

The reader might enjoy finding some more of the latter solutions and trying to spot the
pattern. (See below for the answer.) As usual, we can build up more complicated solutions
by taking general linear combinations of these particular ones. In fact, it will be shown that
the most general solution to the Laplace equation can be written as a convergent infinite
series in the basic polynomial solutions. Later, we will learn how to construct these and
many other solutions to the planar Laplace equation in Chapters 14 and 15.
Inhomogeneous Systems
Now we turn our attention to an inhomogeneous linear system
L[ u ] = f .

(7.30)

Unless f = 0, the solution space to (7.30) is not a subspace. (Why?) The key question is
existence is there a solution to the system? In the homogeneous case, existence is not
an issue, since u = 0 is always a solution to L[ u ] = 0. The key question for homogeneous
systems is uniqueness whether ker L = {0}, in which case 0 is the only solution, or
whether there are nontrivial solutions.
In the matrix case, the compatibility of an inhomogeneous system A x = b which
was required for the existence of a solution led to the general definition of the range of
a matrix, which we copy here.
Definition 7.27. The range of a linear function L: V W is the subspace
rng L = { L[ v ] | v V } W.
The proof that rng L is a subspace is straightforward. If f = L[ v ] and g = L[ w ] are
any two elements of the range, so is any linear combination, since, by linearity
c f + d g = c L[ v ] + d L[ w ] = L[ c v + d w ] rng L.
For example, if L[ v ] = A v is given by multiplication by an m n matrix, then its range
is the subspace rng L = rng A R m spanned by the columns of A the column space of
the coefficient matrix.
The fundamental theorem regarding solutions to inhomogeneous linear equations exactly mimicks our earlier result, Theorem 2.38, for matrix systems.
Theorem 7.28. Let L: V W be a linear function. Let f W . Then the inhomogeneous linear system
L[ u ] = f
(7.31)
has a solution if and only if f rng L. In this case, the general solution to the system has
the form
u = u? + z
(7.32)
where u? is a particular solution, so L[ u? ] = f , and z is a general element of ker L, i.e.,


the general solution to the homogeneous system
L[ z ] = 0.

(7.33)

Proof : We repeat the proof of Theorem 2.38. The condition f rng L is an immediate
consequence of the definition of the range. Suppose u? is a particular solution to (7.31).
If z is a solution to (7.33), then, by linearity,
L[ u? + z ] = L[ u? ] + L[ z ] = f + 0 = f ,
and hence u? + z is also a solution to (7.31). To show that every solution has this form,
let u be a second solution, so that L[ u ] = f . Then
L[ u u? ] = L[ u ] L[ u? ] = f f = 0.
Therefore u u? = z ker L is a solution to (7.33).

Q.E.D.

Remark : In physical systems, the inhomogeneity f typically corresponds to an external


forcing function. The solution to the homogeneous system represents the systems natural,
unforced motion. Therefore, the decomposition formula (7.32) states that a linear system
responds to an external force as a combination of its own internal motion and a specific
motion induced by the forcing. Examples of this important principle appear throughout
the book.
Corollary 7.29. The inhomogeneous linear system (7.31) has a unique solution if
and only if f rng L and ker L = {0}.
Therefore, to prove that a linear system has a unique solution, we first need to prove
an existence result that there is at least one solution, which requires the right hand side f
to lie in the range of the operator L, and then a uniqueness result, that the only solution
to the homogeneous system L[ z ] = 0 is the trivial zero solution z = 0. Consequently, if
an inhomogeneous system L[ u ] = f has a unique solution, then any other inhomogeneous
system L[ u ] = g with the same linear function, also has a unique solution whenever
g rng L.
Example 7.30. Consider the inhomogeneous linear second order differential equation
u00 + u0 2 u = x.

(7.34)

Note that this can be written in the linear system form


L[ u ] = x,

where

L = D2 + D 2

is a linear second order differential operator. The kernel of the differential operator L is
found by solving the associated homogeneous linear equation
L[ z ] = z 00 + z 0 2 z = 0.
Applying the usual solution method, we find that the homogeneous differential equation
has a two-dimensional solution space, with basis functions
z_1(x) = e^{-2x}, \qquad z_2(x) = e^x.

Therefore, the general element of ker L is a linear combination


z(x) = c_1 z_1(x) + c_2 z_2(x) = c_1 e^{-2x} + c_2 e^x.
To find a particular solution to the inhomogeneous differential equation, we rely on
the method of undetermined coefficients. We introduce the solution ansatz u = a x + b, and compute
L[ u ] = L[ a x + b ] = -2 a x - 2 b + a = x.
Equating the two expressions, we conclude that a = -\tfrac{1}{2}, b = -\tfrac{1}{4}, and hence
u^\star(x) = -\tfrac{1}{2} x - \tfrac{1}{4}
is a particular solution to (7.34). Theorem 7.28 then says that the general solution to the inhomogeneous differential equation (7.34) is
u(x) = u^\star(x) + z(x) = -\tfrac{1}{2} x - \tfrac{1}{4} + c_1 e^{-2x} + c_2 e^x.
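A quick symbolic verification (assuming sympy) that the particular and general solutions just found do satisfy (7.34):

    import sympy as sp

    x, c1, c2 = sp.symbols('x c1 c2')
    L = lambda u: u.diff(x, 2) + u.diff(x) - 2*u

    u_star = -x/2 - sp.Rational(1, 4)
    print(sp.simplify(L(u_star) - x))               # 0 : particular solution

    u = u_star + c1*sp.exp(-2*x) + c2*sp.exp(x)
    print(sp.simplify(L(u) - x))                    # 0 : general solution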

Example 7.31. By inspection, we see that


u(x, y) = -\tfrac{1}{2} \sin(x + y)
is a solution to the Poisson equation
\frac{\partial^2 u}{\partial x^2} + \frac{\partial^2 u}{\partial y^2} = \sin(x + y). \qquad (7.35)
Theorem 7.28 implies that every solution to this inhomogeneous version of the Laplace equation takes the form
u(x, y) = -\tfrac{1}{2} \sin(x + y) + z(x, y),
where z(x, y) is an arbitrary solution to the homogeneous Laplace equation (7.29).
Example 7.32. The problem here is to solve the linear boundary value problem
u'' + u = x, \qquad u(0) = 0, \qquad u(\pi) = 0. \qquad (7.36)

The first step is to solve the differential equation. First, we find that cos x and sin x
form a basis for the solution space to the corresponding homogeneous differential equation

One could also employ the method of variation of parameters, although in general the undetermined coefficient method, when applicable, is the more straightforward of the two. See
[ 20 ].

z 00 + z = 0. The method of undetermined coefficients then produces the particular solution


u? = x to the inhomogeneous differential equation, and so the general solution is
u(x) = x + c1 cos x + c2 sin x.

(7.37)

The next step is to see whether any solutions also satisfy the boundary conditions. Plugging
formula (7.37) into the boundary conditions gives
u(0) = c_1 = 0, \qquad u(\pi) = \pi - c_1 = 0.

However, these two conditions are incompatible, and so there is no solution to the linear
system (7.36). The function f (x) = x does not lie in the range of the differential operator
L[ u ] = u00 + u when u is subjected to the boundary conditions.
On the other hand, if we change the inhomogeneity, the boundary value problem
u'' + u = x - \tfrac{1}{2}\pi, \qquad u(0) = 0, \qquad u(\pi) = 0, \qquad (7.38)

does admit a solution, but the solution fails to be unique. Applying the preceding solution
method, we find that the function
u(x) = x - \tfrac{1}{2}\pi + \tfrac{1}{2}\pi \cos x + c \sin x
solves the system for any choice of constant c. Note that z(x) = sin x forms a basis for the
kernel or solution space of the homogeneous boundary value problem
z'' + z = 0, \qquad z(0) = 0, \qquad z(\pi) = 0.

Incidentally, if we slightly modify the interval of definition, considering



u'' + u = f(x), \qquad u(0) = 0, \qquad u(\tfrac{1}{2}\pi) = 0, \qquad (7.39)
then the system is compatible for any inhomogeneity f(x), and the solution to the boundary value problem is unique. For example, if f(x) = x, then the unique solution is
u(x) = x - \tfrac{1}{2}\pi \sin x. \qquad (7.40)

This example highlights some major differences between boundary value problems
and initial value problems for ordinary differential equations. For nonsingular initial value
problems, there is a unique solution for any set of initial conditions. For boundary value
problems, the structure of the solution space either a unique solution for all inhomogeneities, or no solution, or infinitely many solutions, depending on the right hand side
has more of the flavor of what we learned for linear matrix systems. An interesting
question is how to characterize the inhomogeneities f (x) that admit a solution, i.e., lie in
the range of the operator. We will return to this question in Chapter 10.
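As a computational aside (assuming sympy; a sketch, not part of the text), one can confirm the dichotomy observed above: every member of the one-parameter family solves the modified problem (7.38), while the boundary conditions of (7.36) are incompatible.

    import sympy as sp

    x, c, c1, c2 = sp.symbols('x c c1 c2')

    u = x - sp.pi/2 + (sp.pi/2)*sp.cos(x) + c*sp.sin(x)
    print(sp.simplify(u.diff(x, 2) + u - (x - sp.pi/2)))     # 0 : solves the ODE of (7.38)
    print(u.subs(x, 0), sp.simplify(u.subs(x, sp.pi)))       # 0 0 : both boundary conditions hold

    v = x + c1*sp.cos(x) + c2*sp.sin(x)                      # general solution of u'' + u = x
    print(sp.solve([v.subs(x, 0), v.subs(x, sp.pi)], [c1, c2]))   # [] : (7.36) has no solution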
Superposition Principles for Inhomogeneous Systems
The superposition principle for inhomogeneous linear equations allows us to combine
different inhomogeneities provided we do not change the underlying linear operator.
Theorem 7.33. Let L: V W be a prescribed linear function. Suppose that, for


each i = 1, . . . , k, we know a particular solution u?i to the inhomogeneous linear system
L[ u ] = f i . Then, given scalars c1 , . . . , ck , a particular solution to the combined inhomogeneous system
L[ u ] = c1 f 1 + + ck f k
(7.41)
is the same linear combination u? = c1 u?1 + +ck u?k of particular solutions. Then general
solution to (7.41) is
u = u? + z = c1 u?1 + + ck u?k + z,
where z is the general solution to the associated homogeneous system L[ z ] = 0.
The proof is an easy consequence of linearity, and left to the reader. In physical
terms, the superposition principle can be interpreted as follows. If we know the response
of a linear physical system to several different external forces, represented by f 1 , . . . , fk ,
then the response of the system to a linear combination of these forces is just the same
linear combination of the individual responses. The homogeneous solution z represents an
internal motion that the system acquires independent of any external forcing. Superposition requires linearity of the system, and so is always applicable in quantum mechanics,
which is a linear theory, but in classical and relativistic mechanics applies only in a linear
approximation regime corresponding to small motions/displacements/etc. The nonlinear
regime is much more complicated, and the combination of different forces may lead to
unexpected results.
Example 7.34. We already know that a particular solution to the linear differential
equation
u00 + u = x,
is u?1 = x. The method of undetermined coefficients is used to solve the inhomogeneous
equation
u'' + u = \cos x.
Since \cos x and \sin x are already solutions to the homogeneous equation, we must use the solution ansatz u = a x \cos x + b x \sin x, which, when substituted into the differential equation, produces the particular solution u^\star_2 = \tfrac{1}{2} x \sin x. Therefore, by the superposition principle, the inhomogeneous system
u'' + u = 3 x - 2 \cos x
has a particular solution
u^\star = 3 u^\star_1 - 2 u^\star_2 = 3 x - x \sin x.
The general solution is obtained by appending the general solution to the homogeneous equation,
u = 3 x - x \sin x + c_1 \cos x + c_2 \sin x.
Example 7.35. Consider the boundary value problem



u'' + u = x, \qquad u(0) = 2, \qquad u(\tfrac{1}{2}\pi) = -1, \qquad (7.42)

which is a modification of (7.39) with inhomogeneous boundary conditions. The superposition principle applies here, and allows us to decouple the inhomogeneity due to the
forcing from the inhomogeneity due to the boundary conditions. We already solved the
boundary value problem with homogeneous boundary conditions; see (7.40). On the other
hand, the unforced boundary value problem

(7.43)
u00 + u = 0,
u(0) = 2,
u 2 = 1,
has unique solution

u(x) = 2 cos x sin x.

(7.44)

Therefore, the solution to the combined problem (7.42) is the sum of these two:

u(x) = x + 2 \cos x - \bigl( 1 + \tfrac{1}{2}\pi \bigr) \sin x.

The solution is unique because the corresponding homogeneous boundary value problem

z'' + z = 0, \qquad z(0) = 0, \qquad z(\tfrac{1}{2}\pi) = 0,

has only the trivial solution z(x) ≡ 0. Incidentally, the solution (7.44) can itself be decomposed as a linear combination of the solutions cos x and sin x to a pair of yet more elementary boundary value problems with just one inhomogeneous boundary condition; namely, u(0) = 1, u(\tfrac{1}{2}\pi) = 0, and, respectively, u(0) = 0, u(\tfrac{1}{2}\pi) = 1.
Complex Solutions to Real Systems

The best way to obtain solutions to linear, homogeneous, constant coefficient ordinary
differential equations, is through an exponential ansatz, leading to the characteristic equation. Complex roots of the characteristic equation give complex exponentials, but, if the
equation is real, then the real and imaginary parts of the complex solutions are automatically real solutions. However, if the equation has complex coefficients, then this device
cannot be used, and the equation typically has no real solutions. This solution method
is a particular case of a general principle, that produces real solutions to real linear systems from complex solutions. We require some additional structure on the complex vector
spaces involved in order to establish the result.
Definition 7.36. A complex vector space V is called conjugated if it admits an
operation of complex conjugation u ↦ \overline{u} that is compatible with scalar multiplication. In other words, if u ∈ V and λ ∈ C, then we require \overline{\lambda u} = \overline{\lambda}\, \overline{u}.
The simplest example of a conjugated vector space is C n . The complex conjugate of
a vector is obtained by conjugating all its entries. Thus we have
u = v + i w, \qquad \overline{u} = v - i w, \qquad\text{where}\qquad v = \operatorname{Re} u = \frac{u + \overline{u}}{2}, \qquad w = \operatorname{Im} u = \frac{u - \overline{u}}{2 i}, \qquad (7.45)
are the real and imaginary parts of u C n . The same definition of real and imaginary part
carries over to general conjugated vector spaces. Another good example of a conjugated
vector space is the space of complex-valued functions f (x) = r(x) + i s(x) defined on the
interval a x b. The complex conjugate function is f (x) = r(x) i s(x).
A vector v ∈ V of a conjugated vector space is called real if \overline{v} = v. One easily checks
that the real and imaginary parts of a general vector, as defined by (7.45), are both real
vectors.
Definition 7.37. A linear operator L: V W between conjugated vector spaces is
called real if it commutes with complex conjugation:
\overline{L[ u ]} = L[ \,\overline{u}\, ].

(7.46)

For example, the linear function F : C n C m given by matrix multiplication, F (u) =


A u, is real if and only if A is a real matrix. Similarly, a differential operator (7.14) is real
if its coefficients are real-valued functions.
Theorem 7.38. If L[ u ] = 0 is a real homogeneous linear system and u = v + i w is
a complex solution, then its complex conjugate u = v i w is also a solution. Moreover,
both the real and imaginary parts, v and w, of a complex solution are real solutions.
Proof : First note that, by reality,
L[ \overline{u} ] = \overline{L[ u ]} = 0 \qquad\text{whenever}\qquad L[ u ] = 0,

and hence the complex conjugate u of any solution is also a solution. Therefore, by linear
superposition, v = Re u = \tfrac{1}{2} u + \tfrac{1}{2} \overline{u} and w = Im u = \tfrac{1}{2 i} u - \tfrac{1}{2 i} \overline{u} are also solutions. Q.E.D.
Example 7.39. The real linear matrix system

x

2 1 3 0 y
0
=
2 1 1 2
z
0
w

has a complex solution

1 3 i
1
3
1

1
0
u=
=
+ i
.
2 + 2i
2
2
2 2 i
2
2

Since the coefficient matrix is real, the real and imaginary parts,
T

v = ( 1, 1, 2, 2 ) ,

w = ( 3, 0, 2, 2 ) ,

are both solutions.


On the other hand, the complex linear system

2
1+ i

2 i
0

i
1 i


x

0 y
0
=
1
z
0
w
c 2003

Peter J. Olver

has the complex solution


1 i
1
i 0
u=
= +
2
2
1 + 2i
1

1
1
i
.
0
2

However, neither the real nor the imaginary part is a solution to the system.
Example 7.40. Consider the real ordinary differential equation
u00 + 2 u0 + 5 u = 0.
To solve it, as in Example 7.23, we use the exponential ansatz u = e x , leading to the
characteristic equation
2 + 2 + 5 = 0.
There are two roots,
1 = 1 + 2 i ,

2 = 1 + 2 i ,

leading, via Eulers formula (3.74), to the complex solutions


u1 (x) = e1 x = e x cos 2 x + i e x sin 2 x,

u2 (x) = e2 x = e x cos 2 x i e x sin 2 x.

The complex conjugate of the first solution is the second, in accordance with Theorem 7.38.
Moreover, the real and imaginary parts of the two solutions
v(x) = e x cos 2 x,

w(x) = e x sin 2 x,

are individual real solutions. The general solution is a linear combination


u(x) = c1 e x cos 2 x + c2 e x sin 2 x,
of the two linearly independent real solutions.
Example 7.41. Consider the second order Euler differential equation
L[ u ] = a x2 u00 + b x u0 + c u = 0,
where the coefficients 0 6= a, b, c are real constants. If the roots of the characteristic
equation
a r (r 1) + b r + c = a r 2 + (b a) r + c = 0.
are complex, r = s i t, then the solutions xr = xs i t are complex powers. Using Eulers
formula (3.74), we write them in real and imaginary form, e.g.,
xs+ i t = xs e i t log x = xs cos(t log x) + i xs sin(t log x).
Again, by Theorem 7.38, the real and imaginary parts of the complex solution are by
themselves real solutions to the equation. Therefore, we find the general real solution
u(x) = c1 | x |s cos(t log | x |) + c2 | x |s sin(t log | x |)
to the Euler equation in this situation.
3/7/03

248

c 2003

Peter J. Olver

Example 7.42. The complex monomial


u(x, y) = (x + i y)n
is a solution to the Laplace equation (7.29) because, by the chain rule,
2u
= n(n 1)(x + i y)n2 ,
x2

2u
= n(n 1) i 2 (x + i y)n2 = n(n 1)(x + i y)n2 .
y 2

Since the Laplace operator is real, Theorem 7.38 implies that the real and imaginary parts
of this complex solution are real solutions. The real solutions are known as harmonic
polynomials.
To find the explicit formulae for the harmonic polynomials, we use the Binomial
Formula (C.4) and the fact that i 2 = 1, i 3 = i , i 4 = 1, etc., to expand


n n3 3 3
n n2 2 2
n
n
n1
x
i y +
x
i y +
(x + i y) = x + n x
iy +
3
2


n n3 3
n n2 2 2
n
n1
x
y + .
x
i y i
= x + i nx
y
3
2
Separating the real and imaginary terms, we find


n n2 2
n n4 4
n
n
Re (x + i y) = x
x
y +
x
y + ,
2
4


n n3 3
n n5 5
n
n1
Im (x + i y) = n x
y
x
y +
x
y + .
3
5

(7.47)

The first few of the harmonic polynomials appear in Example 7.26. In fact, every polynomial solution to the Laplace equation is a linear combination of the fundamental real
harmonic polynomials. See Chapter 15 for further details.

7.3. Adjoints.
In Sections 2.5 and 5.6, we learned the importance of the adjoint system A T y = f ,
based for understanding systems of linear equations A x = b. Two of the four fundamental
matrix subspaces are based on the adjoint system. We note that the mn matrix A defines
a linear function from R n to R m . Its transpose, AT , has size nm and hence characterizes
a linear function in the reverse direction, from R m to R n .
As with most fundmental concepts for linear matrix systems, the adjoint system and
transpose operation on the coefficient matrix are the prototypes of a more general construction that is valid for general linear systems. However, it is far from obvious how to
transpose a more general linear operator L[ u ], e.g., a differential operator acting on
function space. In this section, we shall introduce the concept of the adjoint of a linear
function that generalizes the transpose operation on matrices. The adjoint (and transpose)
relies on an inner product structure on both the domain and target spaces. For simplicity,
we restrict our attention to real inner product spaces, leaving the complex version to the
interested reader to sort out.
3/7/03

249

c 2003

Peter J. Olver

We begin with a linear function L: V W that maps an inner product space V to


a second inner product space W . We distinguish the inner products on V and W (which
may be different even when V and W are the same vector space) by using a single angle
bracket
ei
e V,
hv;v
to denote the inner product between
v, v

and a double angle bracket


e ii
hh w ; w

to denote the inner product between

e W.
w, w

With the prescription of inner products on both the domain and target spaces, the abstract
definition of the adjoint of a linear function can be formulated.
Definition 7.43. Let V, W be inner product spaces, and let L: V W be a linear
function. The adjoint of L is the function L : W V that satisfies
hh L[ v ] ; w ii = h v ; L [ w ] i

for all

v V,

w W.

(7.48)

Note that the adjoint function goes in the opposite direction to L, just like the transposed matrix. Also, the left hand side of equation (7.48) indicates the inner product on W ,
while the right hand side is the inner product on V , which is where the respective vectors
live.
Lemma 7.44. The adjoint of a linear function is a linear function.
Proof : Given v V , w, z W , and scalars c, d R, we find
h v ; L [ c w + d z ] i = hh L[ v ] ; c w + d z ii = c hh L[ v ] ; w ii + d hh L[ v ] ; z ii
= c h v ; L [ w ] i + d h v ; L [ z ] i = h v ; c L [ w ] + d L [ z ] i.
Since this holds for all v V , we must have
L [ c w + d z ] = c L [ w ] + d L [ z ],
proving linearity.

Q.E.D.

The proof of the next result is left as an exercise.


Lemma 7.45. The adjoint of the adjoint of L is just (L ) = L.
Remark : In infinite-dimensional situations, the adjoint may not exist. But if it does,
then it is uniquely determined by (7.48); see Exercise .
Example 7.46. Let us show how the adjoint definition (7.48) leads directly to the
transpose of a matrix. Let L: R n R m be a linear function given by matrix multiplication
by the m n matrix A. Then L : R m R n is linear, and so is represented by matrix
multiplication by an n m matrix A . We use the Euclidean dot products
e,
e i = vT v
hv;v

3/7/03

e Rn,
v, v

250

e
e ii = wT w,
hh w ; w

e Rm,
w, w

c 2003

Peter J. Olver

on both R n and R m . Evaluation of both sides of the adjoint equation (7.48) gives
hh L[ v ] ; w ii = hh A v ; w ii = (A v)T w = vT AT w,
h v ; L [ w ] i = h v ; A w i = vT A w.

(7.49)

Since these must agree for all v, w, the matrix A representing L is equal to the transposed matrix AT . (See Exercise .) Therefore, the adjoint of a matrix with respect to the
Euclidean inner product is its transpose: A = AT .
Example 7.47. Let us now adopt different, weighted inner products on the domain
and target spaces for L: R n 7 R m . Suppose that
b i = vT M v
b , while
the inner product on R n is given by h v ; v
m
T
b ii = w C w,
b
the inner product on R is given by hh w ; w

where M > 0 and C > 0 are positive definite matrices of respective sizes m m and n n.
Then, in place of (7.49), we have
h v ; A w i = vT M A w.
Equating these expressions, we deduce that AT C = M A . Thereforem the weighted
adjoint of the matrix A is given by the more complicated formula
hh A v ; w ii = (A v)T C w = vT AT C w,

A = M 1 AT C.

(7.50)

As we shall see, everything we know about transposes can be reinterpreted in terms


of adjoints. The next result generalizes the fact, (1.55), that the transpose of the product
of two matrices is the product of the transposes, in the reverse order.
Lemma 7.48. If L: V W and M : W Z have respective adjoints L : W V
and M : Z W , then the composite linear function M L: V Z has adjoint (M L) =
L M , which maps Z to V .
Proof : For v V , z Z, we compute
h v ; (M L) [ z ] i = hhh (M L)[ v ] ; z iii = hhh M [ L[ v ] ] ; z iii
= hh L[ v ] ; M [ z ] ii = h v ; L [ M [ z ] ] i = h v ; (L M )[ z ] i.
Here hhh z ; e
z iii denotes the inner product on Z. This completes the proof.

(7.51)
Q.E.D.

So far, we have only looked at adjoints in the finite-dimensional situation, when the
linear functions are given by matrix multiplication. The equally important case of adjoints
of linear operators on function spaces, e.g., differential operators appearing in boundary
value problems, will require additional analytical tools, and will be a principal focus of
Section 10.3.
SelfAdjoint and Positive Definite Linear Functions
We can also generalize the notions of symmetric and positive definite matrices in a
natural fashion. First, a symmetric matrix equals its own transpose: A = A T . In a similar
fashion, we define a self-adjoint linear function. Throughout this section V is a fixed inner
product space.
3/7/03

251

c 2003

Peter J. Olver

Definition 7.49. A linear function K: V V is called self-adjoint if K = K.


Remark : Technically, this only defines the notion of formally self-adjoint. A truly
self-adjoint linear transformation satisfies additional analytical requirements in the infinitedimensional function space situation, [102]. However, to keep matters simple, we will avoid
the full analytical complications in this introductory treatment of the subject.
Definition 7.50. A linear map K: V V is called positive definite if it is self-adjoint
and, in addition,
h v ; K[ v ] i > 0
for all
0 6= v V.
(7.52)
The next result generalizes our basic observation that the Gram matrices A T A and
A CA, cf. (3.46), (3.48), define symmetric, positive definite matrices.
T

Theorem 7.51. If L: V W is a linear map between inner product spaces, then


the composite map K = L L: V V is self-adjoint. Moreover, K > 0 is positive definite
if and only if ker L = {0}.
Proof : First, by Lemmas 7.48 and 7.45,
K = (L L) = L (L ) = L L = K,
proving self-adjointness. Furthermore, for v V ,
h v ; K[ v ] i = h v ; L [ L[ v ] ] i = h L[ v ] ; L[ v ] i = k L[ v ] k2 > 0.
provided L[ v ] 6= 0. Thus, if ker L = {0}, then the positivity condition (7.52) holds for all
v 6= 0, and conversely.
Q.E.D.
Example 7.52. Consider the case of a linear function L: R n R m that is represented by the m n matrix A. If we use the Euclidean dot product on the two spaces,
then L is represented by the transpose AT , and hence the map K = L L has matrix
representation AT A. Therefore, in this case Theorem 7.51 reduces to our earlier result,
Proposition 3.32, governing the positive definiteness of the matrix product A T A.
More generally, if we use alternative inner products on the domain and target spaces,
as in Example 7.47, then, according to (7.50), the adjoint of L has matrix form
A = M 1 AT C,

K = A A = M 1 AT C A

and therefore

(7.53)

is a self-adjoint, positive definite matrix with respect to the weighted inner product
h v ; w i = vT M w

for

v, w R n .

(7.54)

Note that when we specialize to the usual dot product on R n , then M = I , and K = AT CA
is the symmetric, positive definite Gram matrix constructed in (3.48), that played a key
role in our developement of the equations of equilibrium in Chapter 6.
The case of more general M produces a self-adjoint, but no longer symmetric, positive
definite matrix (7.53). Thus, we did not quite tell the truth when we said we would only
allow symmetric matrices to be positive definite we really meant only self-adjoint matrices. The general case will be important in our discussion of the vibrations of mass/spring
3/7/03

252

c 2003

Peter J. Olver

chains that have nonequal masses. Extensions of these constructions to differential operators underlies the analysis of the static and dynamic differential equations of continuum
mechanics.
Minimization
In Chapter 4, we learned that the solution to a matrix system K u = f , with positive
definite coefficient matrix K = AT A > 0, can be characterized as the unique minimizer
for the quadratic function
p(u) = 12 uT K u uT f .
There is a completely analogous quadratic minimization principle that characterizes the
solution to general linear systems. This result is of tremendous importance in analysis of
boundary value problems for differential equations and also underlies a large class of numerical solution algorithms the finite element methods. Details will appear in subsequent
chapters.
Theorem 7.53. Let K: V V be a positive definite operator on an inner product
space V . If f rng K, then the quadratic function
p(u) =

1
2

h u ; K[ u ] i h u ; f i

(7.55)

has a unique minimizer u = u? , which is the solution to the linear system K[ u ] = f .


Proof : The proof mimics the proof of its matrix counterpart, Theorem 4.2. Since
f = K[ u? ], we can write
p(u) =

1
2

h u ; K[ u ] i h u ; K[ u? ] i =

1
2

h u u? ; K[ u u? ] i

1
2

h u? ; K[ u? ] i. (7.56)

where we used linearity and the fact that K is self-adjoint to identify the terms
h u ; K[ u? ] i = h u? ; K[ u ] i.
Since K > 0 is positive definite, the first term on the right hand side of (7.56) is always
0; moreover it equals its minimal value 0 if and only if u = u? . On the other hand, the
second term does not depend upon u at all, and hence has a fixed value. Therefore, to
minimize p(u) we must make the first term as small as possible, which is accomplished by
setting u = u? .
Q.E.D.
Remark : For linear functions given by matrix multiplication, positive definiteness
automatically implies invertibility, and so the linear system K u = f has a solution for
every right hand side. This is no longer necessarily true when K is a positive definite
operator on an infinite-dimensional function space. Therefore, the existence of a solution
or minimizer is a significant issue. And, in fact, proofs of rigorous analytical existence
results are often rely on the minimization principle!
Corollary 7.54. Suppose L: V W is a linear map between inner product spaces
with ker L = {0}. Let K = L L: V V be the associated positive definite operator. If
f rng K, then the quadratic function
p(u) =

1
2

k L[ u ] k2 h u ; f i

(7.57)

has a unique minimizer u = u? , which is the solution to the linear system K[ u ] = f .


3/7/03

253

c 2003

Peter J. Olver

Proof : It suffices to note that the quadratic term in (7.55) can be written in the
alternative form
h u ; K[ u ] i = h u ; L [ L[ u ] ] i = h L[ u ] ; L[ u ] i = k L[ u ] k2 .
Hence, the two quadratic functions (7.55), (7.55) are the same when K = L L. As a
result, Corollary 7.54 follows immediately from Theorem 7.53.
Q.E.D.
Warning: In (7.57), the first term k L[ u ] k2 is computed using the norm based on the
inner product on W , while the second term h u ; f i uses the inner product on V .
Example 7.55. For a generalized positive definite matrix (7.53), the quadratic funce i = vT M v
e,
tion (7.57) is computed with respect to the alternative inner product h v ; v
so
p(u) = 21 (A u)T C A u uT M f = 21 uT (AT C A)u uT (M f ).
Corollary 7.54 tells us that the minimizer of the quadratic function is the solution to
AT C A u = M f ,

or

K u = M 1 AT C A u = f .

This also follows from our earlier finite-dimensional minimization Theorem 4.2.
In Chapters 10, 14, and 17, we will see how the most important minimization principles for characterizing solutions to the linear boundary value problems of physics and
engineering all arise through this general, abstract construction.

7.4. Linear Transformations.


A linear function L: R n R n that maps n-dimensional Euclidean space to itself defines a linear transformation. As such, it can be assigned a geometrical interpretation that
leads to additional insight into the nature and scope of linear functions. The transformation L maps a point x R n to its image point L[ x ] = A x, where A is its n n matrix
representative. Many of the basic maps that appear in geometry, in computer graphics, in
deformations of elastic bodies, in symmetry and crystallography, and in Einsteins special
relativity, are defined by linear transformations. The two-, three- and four-dimensional
(viewing time as a fourth dimension) cases are of particular importance.
Most of the important, basic linear transformations already appear in the two-dimensional case R 2 . Every linear function L: R 2 R 2 has the form

x
ax + by
a b
L
=
,
where
A=
(7.58)
y
cx + dy
c d
is an arbitrary 2 2 matrix. We have already encountered the rotation matrices

cos sin
,
R =
sin
cos
3/7/03

254

c 2003

(7.59)

Peter J. Olver

Figure 7.4.

Figure 7.5.

Figure 7.6.

Rotation.

Reflection Through the y Axis.

Reflection Through the Diagonal.

whose effect is to rotate every vector in R 2 through an angle ; see Figure 7.4. Planar
rotation matrices coincide with the 2 2 proper orthogonal matrices, meaning matrices Q
that satisfy
QT Q = I ,
det Q = +1.
(7.60)
The improper orthogonal matrices, i.e., those with determinant 1, define reflections. For
example, the matrix

x
x
1 0
, (7.61)
=
corresponds to the linear transformation L
A=
y
y
0 1
which reflects the plane through the y axis; see Figure 7.5. It can be visualized by thinking
of the y axis as a mirror. Another simple example is the improper orthogonal matrix


0 1
x
y
R=
. The corresponding linear transformation L
=
(7.62)
1 0
y
x
is a reflection through the diagonal line y = x; see Figure 7.6.
A similar bipartite classification of orthogonal matrices carries over to three-dimensional
(and even higher dimensional) space. The proper orthogonal matrices correspond to rota3/7/03

255

c 2003

Peter J. Olver

Figure 7.7.

ThreeDimensional Rotation with Axis.

tions and the improper to reflections, or, more generally, reflections combined with rotations. For example, the proper orthogonal matrix

cos sin 0
sin cos 0
(7.63)
0
0
1

corresponds to a rotation through an angle around the zaxis, while

cos 0 sin
0
1
0
sin 0 cos

(7.64)

corresponds to a rotation through an angle around the yaxis. In general, a proper


orthogonal matrix Q = ( u1 u2 u3 ) with columns ui = Q ei corresponds to the rotation
in which the standard basis vectors e1 , e2 , e3 are rotated to new positions given by the
orthonormal basis u1 , u2 , u3 . It can be shown see Exercise that every 3 3
orthogonal matrix corresponds to a rotation around a line through the origin in R 3 the
axis of the rotation; see Figure 7.7.
Since the product of two (proper) orthogonal matrices is also (proper) orthogonal,
this implies that the composition of two rotations is also a rotation. Unlike the planar
case, the order in which the rotations are performed is important! Multiplication of n n
orthogonal matrices is not commutative for n 3. For example, rotating first around the
zaxis and then rotating around the yaxis does not have the same effect as first rotating
around the yaxis and then rotating first around the zaxis. If you dont believe this, try
it out with a solid object, e.g., this book, and rotate through 90 around each axis; the
final configuration of the book will depend upon the order in which you do the rotations.
Then prove this mathematically by showing that the two rotation matrices (7.63), (7.64)
do not commute.
Other important linear transformations arise from elementary matrices. First, the
elementary matrices corresponding to the third type of row operations multiplying a
row by a scalar correspond to simple stretching transformations. For example, if

2x
x
2 0
=
,
then the linear transformation
L
A=
y
y
0 1
3/7/03

256

c 2003

Peter J. Olver

Figure 7.8.

Figure 7.9.

Stretch Along the xaxis.

A Shear in the x Direction.

has the effect of stretching along the x axis by a factor of 2; see Figure 7.8. A matrix with
a negative diagonal entry corresponds to a reflection followed by a stretch. For example,
the elementary matrix (7.61) gives an example of a pure reflection, while the more general
elementary matrix

1 0
2 0
2 0
=
0 1
0 1
0 1
can be written as the product of a reflection through the y axis followed by a stretch along
the x axis. In this case, the order these are performed in is immaterial.
For 22 matrices, there is only one type of row interchange matrix, namely the matrix
(7.62) that yields a reflection through the diagonal y = x. The elementary matrices of Type
#1 correspond to shearing transformations of the plane. For example, the matrix

1 2
x
x + 2y
represents the linear transformation
L
=
,
0 1
y
y
which has the effect of shearing the plane along the xaxis. The constant 2 will be called
the shear factor , which can be either positive or negative. Each point moves parallel to the
x axis by an amount proportional to its distance from the axis; see Figure 7.9.. Similarly,
the elementary matrix

x
x
1 0
,
=
represents the linear transformation
L
y 3x
y
3 1
which represents a shear along the y axis. hears map rectangles to parallelograms; distances
are altered, but areas are unchanged.
All of the preceding linear maps are invertible, and so represented by nonsingular
matrices. Besides the zero map/matrix, which sends every point x R 2 to the origin, the
3/7/03

257

c 2003

Peter J. Olver

simplest singular map is

1 0
corresponding to the linear transformation
0 0


x
x
L
=
,
y
0
T

which is merely the orthogonal projection of the vector ( x, y ) onto the xaxis. More
general rank one matrices represent different sorts of projections mapping the plane R 2 to
a line through the origin; see Exercise for details.
A similar classification of linear maps appears in higher dimensions. The linear transformations constructed from elementary matrices can be built up from the following four
basic types:
(i ) A stretch in a single coordinate direction.
(ii ) A reflection through a coordinate plane.
(iii ) A reflection through a diagonal plane,
(iv ) A shear along a coordinate axis.
Moreover, we already proved that every nonsingular matrix can be written as a product
of elementary matrices; see (1.47). This has the remarkable consequence that every linear
transformation can be constructed from a sequence of elementary stretches, reflections, and
shears. In addition, there is one further, non-invertible type of basic linear transformation:
(v ) An orthogonal projection onto a lower dimensional subspace.
All possible linear transformations of R n can be built up. albeit non-uniquely, as a combination of these five basic types.
Example 7.56. Consider the matrix

A=

3
2
1
2

12

3
2

corresponding to a rotation through = 30 , cf. (7.59). Rotations are not elementary


linear transformations. To express this particular rotation as a product of elementary
matrices, we need to perform a Gauss-Jordan row reduction to reduce it to the identity
matrix. Let us indicate the basic steps:

!
!

3
12
1
0
2
E1 =
,
,
E1 A =
2
13 1
0
3

!
3
1

1 0
2
2

E2 =
,
E 2 E1 A =
,
0 23
0
1

!
!

2
1
0
1

3
3
E3 =
,
,
E 3 E2 E1 A =
0
1
0 1

1 13
1 0
E4 =
,
E 4 E3 E2 E1 A = I =
,
0 1
0 1
3/7/03

258

c 2003

Peter J. Olver

and hence
A = E11 E21 E31 E41 =

1
1
3

0
1

2
3

3
2

13

Therefore, a 30 rotation can be effected by performing the following composition of elementary transformations in the prescribed order
(1) A shear in the xdirection with shear factor 13 ,

(2) A stretch in the direction of the xaxis by a factor of 23 ,


(3) A stretch (contraction) in the y-direction by the reciprocal factor
(4) A shear in the direction of the yaxis with shear factor

2 ,
3

1 .
3

The fact that the combination of these special transformations results in a pure rotation
is surprising and non-obvious. Similar decompositions apply in higher dimensions.
Change of Coordinates
Sometimes a linear transformation represents an elementary geometrical transformation, but this is not evident because the matrix happens to be written in the wrong
coordinates. The characterization of linear functions from R n to R m as multiplication by
m n matrices in Theorem 7.5 relied on the use of the standard bases of the domain and
target spaces. In many cases, the standard basis is not particularly well adapted to the
linear transformation, and one tries to work in a more comfortable basis. So, the question
arises as to how to write a given linear transformation in a new basis.
The following general result says that, in any basis, a linear function on finitedimensional vector spaces can be realized by matrix multiplication of the coordinates.
But the particular matrix representative will depend upon the choice of basis.
Theorem 7.57. Let L: V W be a linear function. Suppose V has basis v1 , . . . , vn
and W has basis w1 , . . . , wm . We can write
v = x 1 v1 + + x n vn ,

w = y 1 w1 + + y m wm ,

where x = ( x1 , x2 , . . . , xn ) are the coordinates of v relative to the chosen basis on V and


T
y = ( y1 , y2 , . . . , ym ) are those of w relative to its basis. Then w = L[v] is given in these
coordinates by multiplication, y = B x, by an m n matrix B.
Proof : We mimic the proof of Theorem 7.5, replacing the standard basis vectors by
more general basis vectors. In other words, we should apply L to the basis vectors of V
and express the result as a linear combination of the basis vectors in W . Specifically, we
m
X
write L[ vj ] =
bij wi . The coefficients bij form the entries of the desired coefficient
i=1

matrix, as the reader can readily verify.


3/7/03

Q.E.D.

259

c 2003

Peter J. Olver

In particular, if we are given a linear transformation L: R n R m , it will be represented by a particular m n matrix A relative to the standard bases e 1 , . . . , en and
b
e1 , . . . , b
em of the domain and target spaces. If we introduce new bases for R n and R m
then the same linear transformation may have a completely different matrix representation.
Therefore, different matrices can represent the same underlying linear transformation, with
respect to different bases.
Example 7.58. Consider the linear transformation

x
xy
L
=
y
2x 4y
2
which we write in the standard,
Cartesian coordinates x, y on R . The corresponding

1 1
is the matrix representation of L relative to the
coefficient matrix A =
2 4
standard basis e1 , e2 of R 2 , meaning that


1
1
= e1 + 4 e2 .
= e1 + 2 e 2 ,
L[ e2 ] =
L[ e1 ] =
4
2

Let us see what happens if we replace the standard basis by a different basis

1
1
.
,
v2 =
v1 =
2
1
What is the corresponding matrix formulation of the same linear transformation? According to the basic recipe of Theorem 7.57, we must compute

3
2
= 3 v2 .
= 2 v1 ,
L[ v2 ] =
L[ v1 ] =
6
2
The linear transformation acts by stretching in the direction v1 by a factor of 2 and
simultaneously stretching in the direction v2 by a factor of 3. Therefore,

the matrix form


2 0
of L with respect to this new basis is the diagonal matrix D =
. In general, then,
0 3
L[ a v1 + b v2 ] = 2 a v1 + 3 b v2 ,
T

which is just the effect of multiplying the new coordinates a = ( a, b ) by the diagonal
matrix D. The simple geometry of this linear transformation is thereby exposed through
the choice of an adapted basis.
How does one effect a change of basis in general? According to (2.23), if v 1 , . . . , vn
T
form a new basis of R n , then the coordinates y = ( y1 , y2 , . . . , yn ) of a vector
x = y 1 v1 + y2 v2 + + y n vn
are found by solving the linear system
S y = x,
3/7/03

where
260

S = ( v 1 v2 . . . v n )

c 2003

(7.65)
Peter J. Olver

is the nonsingular n n matrix whose columns are the basis vectors.


Consider first a linear transformation L: R n R n from R n to itself. When written
in terms of the standard basis, L[ x ] = A x has a certain n n coefficient matrix A. To
change to the new basis v1 , . . . , vn , we use(7.65) to rewrite the x coordinates in terms of
the new y coordinates. We also need to write the target vector f = A x in terms of the new
coordinates, which requires f = S g. Therefore, the new target coordinates are expressed
in terms of the new domain coordinates via
g = S 1 f = S 1 A x = S 1 A S y = B y.
Therefore, in the new basis, the matrix form of our linear transformation is
B = S 1 A S.

(7.66)

Two matrices related by such an equation are called similar . Thus similar matrices represent the same linear transformation, reative to different bases of R n .
Returning to the preceding
we assemble the new basis vectors to form the
example,
1
1
, and verify that
change of basis matrix S =
1 2

2
1
1 1
1 1
2 0
1
S AS =
=
= D.
1 1
2 4
1 2
0 3

More generally, a linear transformation L: R n R m is represented by an m n


matrix A with respect to the standard bases. What happens if we introduce a new basis
v1 , . . . , vn on the domain space R n and a new basis w1 , . . . , wm on the target space R m ?
Arguing as above, we conclude that the matrix representative of L with respect to these
new bases is given by
B = T 1 A S,
(7.67)
where S = ( v1 v2 . . . vn ) is the domain basis matrix, while T = ( w1 w2 . . . wm ) is the
range basis matrix.
In particular, if an linear transformation has rank
r = dim rng L = dim corng L,
and we choose a basis v1 , . . . , vn of R n such that v1 , . . . , vr form a basis of corng L while
vr+1 , . . . , vn form a basis for ker L = (corng L) . According to Proposition 5.48, the
image vectors w1 = L[ v1 ], . . . , wr = L[ vr ] form a basis for rng L. We further choose a
basis wr+1 , . . . , wm for coker L = (rng L) , and note that the combination w1 , . . . , wm
forms a basis for R m . The matrix form of L relative to these particular bases is simply

1 0 0 ... 0 0 ... 0
0 1 0 ... 0 0 ... 0

0 0 1 ... 0 0 ... 0

. . . .

.. 1 0 . . . 0
(7.68)
B = .. .. ..
.

0 0 0 ... 0 0 ... 0

. . . .
. . ... ... . . . ...
.. .. ..
0 0 0 ... 0 0 ... 0
3/7/03

261

c 2003

Peter J. Olver

In this matrix, the first r rows have a single 1 in the diagonal slot, indicating that the
while the last m r rows are all zero. Thus, by a suitable choice of bases on both the
domain and target spaces, any linear transformation has a particularly simple canonical
form.
Example 7.59. According to the illustrative example following Theorem 2.47, the
matrix

2 1 1
2
A = 8 4 6 4
4 2 3
2
has rank 2. Based on the calculations, we choose the domain space basis
1

0
0
v2 =
,
2
4

2
1
v1 =
,
1
2

2
0
v4 =
,
2
1

1
v 3 = ,
0
0

where v1 , v2 are a basis for the row space corng A, while v3 , v4 are a basis for ker A. For
our basis of the target space, we first compute w1 = A v1 and w2 = A v2 , which form a
basis for rng A. We supplement these by the single basis vector w3 for coker A, and so

10
w1 = 34 ,
17

6
w2 = 4 ,
2

In terms of these two bases, the canonical matrix

1
1

B = T AS = 0
0

where

2
1

S=
1
2

0
0
2
4

1
2

1
0
0

2
0
,
2
1

w3 = 12 ,
1

form of the linear function is

0 0 0
1 0 0 ,
0 0 0

10
T = 34
17

6
4
2

0
1
2

as the reader can verify directly if desired.

7.5. Affine Transformations and Isometries.


Not every transformation of importance in geometrical applications arises as a linear
function. A simple example is a translation, where all the points in R n are moved in the
same direction by a common distance. The function that does this is
T [ x ] = x + a,
3/7/03

262

x Rn,

(7.69)
c 2003

Peter J. Olver

where a R n is a fixed vector that determines the direction and the distance that the
points are translated. Except in the trivial case a = 0, the translation T is not a linear
function because
T [ x + y ] = x + y + a 6= T [ x ] + T [ y ] = x + y + 2 a.
Or, even more simply, one notes that T [ 0 ] = a 6= 0.
Combining translations and linear functions leads us to an important class of geometrical transformations.
Definition 7.60. A function F : R n R m of the form
F [ x ] = A x + b,

(7.70)

where A is an m n matrix and b R n a fixed vector, is called an affine function.


For example, every affine function from R to itself has the form
f (x) = x + .

(7.71)

As mentioned earlier, even though the graph of f (x) is a straight line, f is not a linear
function unless = 0, and the line goes through the origin.
Example 7.61. The affine function

y + 1
1
x
0 1
=
+
F (x, y) =
x2
2
y
1 0
has the effect of first rotating the plane R 2 by 90 about the origin, and then translating
T
by the vector ( 1, 2 ) . The reader may enjoy proving that this combination has the same

effect as just rotating the plane through an angle of 90 centered at the point 43 , 12 .
See Exercise .
The composition of two affine functions is again an affine function. Specifically, given
F [ x ] = A x + a,

G[ y ] = B y + b,

then
(G F )[ x ] = G[ F [ x ] ] = G[ A x + a ]
= B (A x + a) + b = C x + c,

where

C = B A,

c = B a + b. (7.72)

Note that the coefficient matrix of the composition is the product of the coefficient matrices,
but the resulting vector of translation is not the sum the two translation vectors!
Isometry
A transformation that preserves distance is known as a rigid motion, or, more abstractly, as an isometry. We already encoutered the basic rigid motions in Chapter 6
they are the translations and the rotations.
3/7/03

263

c 2003

Peter J. Olver

Definition 7.62. A function F : V V is called an isometry on a normed vector


space if it preserves the distance between vectors:
d(F [ v ], F [ w ]) = d(v, w)

for all

v, w V.

(7.73)

Since the distance between points is just the norm of the vector between them,
d(v, w) = k v w k, cf. (3.26), the isometry condition (7.73) can be restated as
k F[v] F[w] k = k v w k

for all

v, w V.

(7.74)

Clearly, any translation


T [ v ] = v + a,

where

aV

is a fixed vector

defines an isometry. A linear transformation L: V V defines an isometry if and only if


k L[ v ] k = k v k

for all

v V,

(7.75)

because, by linearity, L[ v w ] = L[ v ] L[ w ]. More generally, an affine transformation


F [ v ] = L[ v ] + a is an isometry if and only if its linear part L[ v ] is.
For the standard Euclidean norm on V = R n , the linear isometries consist of rotations and reflections. Both are characterized by orthogonal matrices, the rotations having
determinant + 1, while the reflections have determinant 1.
Proposition 7.63. A linear transformation L[ v ] = Q v defines a Euclidean isometry
of R n if and only if Q is an orthogonal matrix.
Proof : The linear isometry condition (7.75) requires that
k Q v k2 = (Q v)T Q v = vT QT Q v = vT v = k v k2

for all

v Rn.

Clearly this holds if and only if QT Q = I , which, (5.32), is the condition for orthogonality.
Q.E.D.
Remark : It can be proved, [Cohn], that the most general Euclidean isometry of R n
is an affine transformation F [ v ] = Q v + a where Q is an orthogonal matrix and a is
a constant vector. Therefore, every Euclidean isometry is a combination of translations,
rotations and reflections.
The isometries of R 3 are fundamental to the understanding of how objects move in
three-dimensional space. Basic computer graphics and animation require efficient implementation of rigid isometries in three-dimensional space, coupled with appropriate (nonlinear) perspective transformations that govern the projection of three-dimensional objects
onto a two-dimensional viewing screen.

3/7/03

264

c 2003

Peter J. Olver

Chapter 8
Eigenvalues and Dynamics
So far, we have concentrated on statics: unchanging equilibrium configurations of
mass/spring chains, circuits, and structures. It is now time to introduce motion into our
universe. In general, a dynamical system refers to the (differential) equations governing the
temporal behavior of some physical system. In a discrete system, the dynamical behavior
of, say, a massspring chain, a simple electrical circuit, or the vibrations of a structure,
is governed by a system of ordinary differential equations. Dynamics of continuous media
fluids, solids and gasses are governed by partial differential equations, and will form
the focus of the later chapters.
The goal of this chapter is to understand the behavior of the simplest class of dynamical systems constant coefficient linear systems of ordinary differential equations.
We begin with a very quick review of the scalar case, whose solutions are exponential
functions. Applying a similar exponential ansatz to the vector version leads us naturally
to the all-important notions of eigenvalue and eigenvector for a square matrix. The next
three sections are devoted to the basic properties of eigenvalues and eigenvectors. In particular, complete or diagonalizable matrices produce bases consisting of eigenvectors, and
computations become significantly simpler when performed in the eigenvector basis. The
most important class are the symmetric matrices, whose eigenvectors form an orthogonal
basis of R n ; in fact, this is by far the most common way for orthogonal bases to appear.
For a first order system, the eigenstates describe the basic modes of exponential
growth, decay, or periodic behavior. The stability of the equilibrium solution is almost
entirely determined by the eigenvalues of the associated matrix, which explains their ubiqitous role in physical phenomena. Most of the important phenomena already appear in the
two-dimensional systems, and we devote Section 8.7 to a complete description of the possible behaviors.
For a mechanical system without damping or frictional effects, the eigenstates are the
normal modes of the system, each periodically vibrating with its associated fundamental
frequency. Linearity then allows us to describe the general motion as a linear superposition
of the individual pure periodic normal modes of vibration. Such a linear combination
will, in general, no longer be periodic, and so the motion can appear to be quite erratic.
Nevertheless, it is merely the superposition of very simple periodic motions called
quasi-periodic, and, unlike a chaotic, nonlinear system, is eminently predictable. When
the system is forced, the result is a superposition of the free quasi-periodic motion and a
particular reaction of the system to the forcing. In particular, periodic forcing will typically
lead to quasiperiodic motion, unless we try to force the system at one of the fundamental
frequencies; this will lead to the phenomenon of resonance, where the vibrations become
3/7/03

265

c 2003

Peter J. Olver

larger and larger and the system breaks apart.


Many of the observations in this chapter are fundamental to general dynamical systems, and, as we shall see, apply equally well to the continuous case, where the physical
system is governed by a linear partial differential equation. For example, the orthogonal
bases of functions appearing in Fourier analysis and solution of partial differential equations arise as the eigenvectors, or, rather, eigenfunctions of symmetric boundary value
problems for linear differential operators. However, before making this leap in abstraction,
we need to properly understand the finite-dimensional matrix version first. Finally, even
when the physics forces us to consider nonlinear systems, the tools from the linear regime
will be essential for navigating these far more treacherous waters.

8.1. First Order Linear Systems of Ordinary Differential Equations.


In this chapter, we will study systems consisting of n linear, constant-coefficient ordinary differential equations
du1
= a11 u1 + a12 u2 +
dt
du2
= a21 u1 + a22 u2 +
dt
..
.
dun
= an1 u1 + an2 u2 +
dt

+ a1n un ,
+ a2n un ,

(8.1)

..
.
+ ann un ,

involving n unknown functions u1 (t), u2 (t), . . . , un (t) depending on a scalar variable t R,


which we usually view as time. Such systems can be written in the compact matrix form
du
= A u,
dt

(8.2)
T

where A is a constant nn matrix, and u(t) = ( u1 (t), . . . , un (t) ) a vector-valued function.


Each solution u(t) serves to parametrize a curve in R n , called a solution trajectory for the
system. Our immediate goal is to develop solution techniques for such systems.
The Scalar Case
We begin by analyzing the elementary scalar ordinary differential equation
du
= a u.
dt

(8.3)

in detail. Here a R is a real constant, while the unknown u(t) is a scalar function.
As you learned in calculus, the general solution to (8.3) is an exponential function
u(t) = c ea t .

(8.4)

The integration constant c is uniquely determined by a single initial condition


u(t0 ) = u0
3/7/03

266

(8.5)
c 2003

Peter J. Olver

imposed at an initial time t0 . Substituting t = t0 into (8.4), we find


u(t0 ) = c ea t0 = u0 ,

and so

c = u 0 e a t0 .

We conclude that there is a unique solution to the scalar initial value problem (8.3), (8.5),
namely
u(t) = u0 ea(tt0 ) .
(8.6)
Example 8.1. The radioactive decay of an isotope, say Uranium 238, is governed
by the stable differential equation
du
= u.
(8.7)
dt
Here u(t) denotes the amount of the isotope remaining at time t, and the coefficient
> 0 governs the decay rate. The solution is given by an exponentially decaying function
u(t) = c e t , where c = u(0) is the initial amount of radioactive material.
The half-life is the time it takes for half of a sample to decay. Therefore, the half-life
t? will occur when u(t? ) = 12 u(0). This requires solving the equation
?

e t = 12 ,

so that

t? =

log 2
.

(8.8)

Let us make some elementary, but pertinent observations about this simple linear
dynamical system. First of all, since the equation is homogeneous, the zero function
u(t) 0 (corresponding to c = 0) is a constant solution, known as an equilibrium solution
or fixed point, since it does not depend on t. If the coefficient a > 0 is positive, then the
solutions (8.4) are exponentially growing (in absolute value) as t + . This implies that
the zero equilibrium solution is unstable. The initial condition u(t 0 ) = 0 produces the zero
solution, but if we make a tiny error (either physical, numerical, or mathematical) in the
initial data, say u(t0 ) = , then the solution u(t) = ea(tt0 ) will eventually get very far
away from equilibrium. More generally, any two solutions with very close, but not equal,
initial data, will eventually become arbitrarily far apart: | u 1 (t) u2 (t) | as t .
One consequency is the inherent difficulty in accurately computing the long time behavior
of the solution, since small numerical errors will eventually have very large effects.
On the other hand, if a < 0, the solutions are exponentially decaying in time. In this
case, the zero solution is stable, since a small error in the initial data will have a negligible
effect on the solution. In fact, the zero solution is globally asymptotically stable. The
phrase asymptotically stable implies that solutions that start out near zero eventually
return; more specifically, if u(t0 ) = is small, then u(t) 0 as t . The adjective
globally implies that this happens no matter how large the initial data is. In fact, for
a linear system, the stability (or instability) of an equilibrium solution is always a global
phenomenon.
The borderline case is when a = 0. Then all the solutions to (8.3) are constant. In this
case, the zero solution is stable indeed, globally stable but not asymptotically stable.
The solution to the initial value problem u(t0 ) = is u(t) . Therefore, a solution that
3/7/03

267

c 2003

Peter J. Olver

-1

-0.5

0.5

-1

-0.5

0.5

-1

-0.5

0.5

-2

-2

-2

-4

-4

-4

-6

-6

a<0

-6

a=0
Figure 8.1.

a>0

Solutions to u = a u.

starts out near equilibrium will remain near, but will not asymptotically return. The three
qualititatively different possibilities are illustrated in Figure 8.1.
Similar stability results hold for linear systems (8.2) in several unknowns, but to
understand them, we must acquire some familiarity with the basic solution techniques.
The Phase Plane
Many physical phenomena are modeled by second order ordinary differential equations.
The simplest scalar version is a linear, homogeneous equation
du
d2 u
+

+ u = 0,
(8.9)
dt2
dt
in which , are constants. In your first course on ordinary differential equations, you
learned how to solve such equations; the basic method is reviewed in Example 7.23 and in
the following example.
There is a standard trick to convert any second order scalar equation, e.g., (8.9) into
a first order system. One introduces the variables

u2 = u =

u1 = u,

du
.
dt

(8.10)

These variables satisfy


du
du2
d2 u
du
du1
=
= u2 ,
= 2 = u
= u1 u2 .
dt
dt
dt
dt
dt
In this manner, the second order equation is converted into the first order system

0
1
u1 (t)
.
, and the coefficient matrix is A =
u = A u, where u(t) =

u2 (t)
(8.11)

The (u1 , u2 ) = (u, u) plane is referred to as the phase plane. The solutions u(t) to
(8.11) travel along curves in the phase plane the solution trajectories. In particular,

We will sometimes use dots as a shorthand notation for time derivatives.

3/7/03

268

c 2003

Peter J. Olver

the equilibrium solution u(t) 0 remains fixed at the origin, but all other solutions
describe genuine curves. The collection of all possible solution trajectories is called the
phase portrait of the system. An important fact is that, for a (constant coefficient) first
order system, the phase plane trajectories never cross. This property, which also applies
to nonlinear systems, is a consequence of the uniqueness properties of solutions, and will
be discussed in detail in Section 19.2. The one feature that is not so easily pictured in
the phase portrait is the speed at which the solution moves along the phase curves this
would require a more complicated three-dimensional plot with time on the third axis.
It is not hard to verify that every solution u(t) to the second order equation yields
T

to the phase plane system (8.11). Vice versa, if u(t) =


a solution u(t) = u(t), u(t)
T
( u1 (t), u2 (t) ) is any solution to the system (8.11), then its first component u(t) = u1 (t)
defines a solution to the original scalar equation (8.9). We conclude that the scalar equation
and its phase plane version are completely equivalent; solving one will immediately lead
to a solution of the other.
Example 8.2. Consider the second order equation
d2 u du
+
6 u = 0.
dt2
dt
The equivalent phase plane system is

du
0 1
=
u,
or, in full detail,
6 1
dt

(8.12)

u1 = u2 ,

u2 = 6 u 1 u2 .

(8.13)

To solve the equation, we substitute an exponential ansatz u(t) = e t into the equation;
the result is the characteristic equation
2 + 6 = 0,

with solutions

1 = 2,

2 = 3.

We conclude that e2 t and e 3 t form a basis for the solution space to (8.12), and hence the
general solution can be written as a linear combination
u(t) = c1 e2 t + c2 e 3 t ,
where c1 , c2 are arbitary constants.
Our identification (8.10) of the phase plane variables tells us that the solution to the
equivalent phase ploane system (8.13) is given by
u1 (t) = u(t) = c1 e2 t + c2 e 3 t ,
du
= 2 c 1 e2 t 3 c 2 e 3 t ,
u2 (t) =
dt

and hence

u(t) =

c1 e2 t + c 2 e 3 t
.
2 c 1 e2 t 3 c 2 e 3 t

A plot of the phase plane trajectories u(t) for variaous choices of the constants c 1 , c2
appears in Figure 8.2. The horizontal axis represents the solution u 1 = u(t) whereas

the vertical axis represents is derivative u2 = u(t). With some practice, one learns to
understand the temporal behavior of the solution from studying its phase plane trajectory.
Many more examples appear in Section 8.7 below.
3/7/03

269

c 2003

Peter J. Olver

Figure 8.2.

du
Phase Plane Trajectories for
=
dt

0 1
6 1

u.

8.2. Eigenvalues and Eigenvectors.


Let us now discuss primary goal how to solve a linear system of ordinary differential
equations
du
= A u,
dt
with constant coefficient matrix. Since exponential functions describe the solution in the
scalar case, we suspect that they will also play a role here. To keep matters as simple as
possible, let us investigate whether the components ui (t) = vi e t of our desired solution
u(t) = (u1 (t), . . . , un (t))T are constant multiples of the same exponential function. In
other words, we make the solution ansatz
u(t) = e t v.

(8.14)

Here is a constant scalar, so e t is a scalar function of t, while v R n is a constant


vector. Our goal is to determine both the scalar and the vector v so that (8.14) is a
solution to the system. Now, the derivative of u(t) is
d t
du
e v = e t v,
=
dt
dt

since v is constant. On the other hand, since e t is a scalar, it commutes with matrix
multiplication, and so
A u = A e t v = e t A v.
Therefore, u(t) will solve the system if and only if
e t v = e t A v,
or, canceling the common scalar factor e t ,
v = A v.

See the footnote in Chapter 7 for an explanation of the term.

3/7/03

270

c 2003

Peter J. Olver

The result is a system of algebraic equations relating the vector v and the scalar .
The preceding analysis motivates the following fundamental definition. As stated, it
applies to square matrices; later, the underlying idea will be generalized to linear operators
on function space, with applications to dynamical systems of partial differential equations.
Definition 8.3. Let A be an n n matrix. A scalar is called an eigenvalue of A
if there is a non-zero vector v 6= 0, called an eigenvector , such that
A v = v.

(8.15)

Remark : The rather strange-looking terms eigenvalue and eigenvector are hybrid
GermanEnglish words. In German, they are Eigenwert and Eigenvektor , which can be
translated as proper value and proper vector. For some reason, the half-translated
terms have acquired a certain charm, and are now standard. The alternative English terms
characteristic value and characteristic vector can be found in some (mostly older) texts.
Oddly, characteristic equation, defined below, has survived.
The requirement that the eigenvector v be nonzero is important, since v = 0 is a
trivial solution to the eigenvalue equation (8.15) for any scalar . Moreover, as far as
solving linear ordinary differential equations goes, the zero vector v = 0 only gives the
trivial zero solution u(t) 0.
The eigenvalue equation (8.15) is a linear homogeneous equation for the eigenvector v
provided the eigenvalue is known but is mildly nonlinear as a combined equation
for and v. Gaussian elimination per se will not solve the problem, and we are in need of
a new idea. Let us begin by rewriting the equation in the form
(A I )v = 0,

(8.16)

where I is the identity matrix of the correct size . Now, for given , equation (8.16) is a
homogeneous linear system for v, and always has the trivial zero solution v = 0. But we
are specifically seeking a nonzero solution! According to Theorem 1.44, a homogeneous
linear system has a nonzero solution, v 6= 0, if and only if its coefficient matrix, which
in this case is A I , is a singular matrix. This is the key to resolving the eigenvector
equation.
Theorem 8.4. A scalar is an eigenvalue of the square matrix A if and only if the
matrix A I is singular. The corresponding eigenvectors are the nonzero solutions v 6= 0
to the eigenvalue equation (A I )v = 0.
We know a number of tests for singularity, including the determinantal criterion given
in Theorem 1.49. Therefore, the following result is an immediate corollary of Theorem 8.4.

Note that we cannot write (8.16) in the form (A )v = 0 since we do not know how to
subtract a scalar from a matrix A. Strangely, if we type A in Matlab, the program will
subtract from all the entries of A, which is not what we are after!

3/7/03

271

c 2003

Peter J. Olver

Theorem 8.5. A scalar is an eigenvalue of the matrix A if and only if is a


solution to the characteristic equation
det(A I ) = 0.

(8.17)

In practice, to find eigenvalues and eigenvectors by hand, one first solves the characteristic equation (8.17). Then, for each eigenvalue one uses standard linear algebra
methods, i.e., Gaussian elimination, to solve the linear eigenvector equation (8.16) for the
eigenvector v.
Remark : The reader may recall that we said one should never compute determinants
in practical examples. So why using determinants to find eigenvalues? The truthful answer
is that the practical computation of eigenvalues and eigenvectors never uses the characteristic equation! There are just too many numerical difficulties in a) computing the determinant, and b) solving the resulting polynomial equation. Nevertheless, the characteristic
equation does give us important theoretical insight into the structure of the eigenvalues of
a matrix, and can be used on small, e.g., 2 2 and 3 3, matrices. Numerical algorithms
for computing eigenvalues and eigenvectors are based on completely different ideas, and
will be discussed in Section 9.6.
Example 8.6. Consider the 2 2 matrix

3 1
.
A=
1 3
We compute the determinant in the characteristic equation using (1.79):

3
1
= (3 )2 1 = 2 6 + 8.
det(A I ) = det
1
3
The characteristic equation is a quadratic polynomial equation, and can be solved by
factorization :
2 6 + 8 = ( 4) ( 2) = 0.
We conclude that A has two eigenvalues: 1 = 4 and 2 = 2.
For each eigenvalue, the corresponding eigenvectors are found by solving the associated
homogeneous linear system (8.16). For the first eigenvalue, the corresponding eigenvector
equation is

x + y = 0,
0
x
1 1
,
or
=
(A 4 I ) v =
0
y
1 1
x y = 0.
The general solution is

x = y = a,

so



a
1
v=
=a
,
a
1

Alternatively, one can invoke the quadratic formula.

3/7/03

272

c 2003

Peter J. Olver

where a is an arbitrary scalar. Only the nonzero solutions count as eigenvectors, and so
the eigenvectors for the eigenvalue 1 = 4 have a 6= 0, i.e., they are all nonzero scalar
T
multiples of the eigenvector v1 = ( 1 1 ) .
Remark : If v is an eigenvector of A for the eigenvalue , then so is any nonzero scalar
multiple of v. In practice, we only distinguish linearly independent eigenvectors. Thus, in
T
this example, we shall say v1 = ( 1, 1 ) is the eigenvector corresponding to the eigenvalue
1 = 4, when we really mean that the eigenvectors for 1 = 4 consist of all nonzero scalar
multiples of v1 .
Similarly, for the second eigenvalue 2 = 2, the eigenvector equation is


1 1
x
0
(A 2 I ) v =
=
.
1 1
y
0
T

The solution ( a, a ) = a ( 1, 1 ) is the set of scalar multiples of the eigenvector


T
v2 = ( 1, 1 ) . Therefore, the complete list of eigenvalues and eigenvectors (up to scalar
multiple) is


1
1
.
,
2 = 2,
v2 =
1 = 4,
v1 =
1
1
Example 8.7. Consider the 3 3 matrix

0 1 1
A = 1 2
1 .
1 1
2

Using the formula (1.88) for a 3 3 determinant, we compute

1
1
det(A I ) = det 1 2
1
1
1
2

= ( )(2 )2 + (1) 1 1 + (1) 1 1

1 (2 )(1) 1 1 ( ) (2 ) 1 (1)
= 3 + 4 2 5 + 2.
The cubic characteristic equation can be factorized:
3 + 4 2 5 + 2 = ( 1)2 ( 2) = 0.
Most 3 3 matrices have three different eigenvalues, but this particular one has only two:
1 = 1, which is called a double eigenvalue since it is a double root of the characteristic
equation, along with a simple eigenvalue 2 = 2.

If, at this stage, you end up with a linear system with only the trivial zero solution, youve
done something wrong! Either you dont have an correct eigenvalue maybe you made a mistake
setting up and/or solving the characteristic equation or youve made an error solving the
eigenvector system.

3/7/03

273

c 2003

Peter J. Olver

The eigenvector equation (8.16) for the double eigenvalue 1 = 1 is


0
1 1 1
x

(A I )v =
1
1
1
y = 0.
0
1
1
1
z

The general solution to this homogeneous linear system

a b
1
1
v = a = a 1 + b 0
b
0
1

depends upon two free variables, y = a, z = b. Any nonzero solution forms a valid
eigenvector for the eigenvalue 1 = 1, and so the general eigenvector is any non-zero linear
T
b1 = ( 1, 0, 1 )T .
combination of the two basis eigenvectors v1 = ( 1, 1, 0 ) , v
On the other hand, the eigenvector equation for the simple eigenvalue 2 = 2 is


0
2 1 1
x

(A 2 I )v =
1
0
1
y = 0.
0
1
1
0
z

The general solution

1
a
v = a = a 1
1
a

consists of all scalar multiple of the eigenvector v2 = ( 1, 1, 1 ) .


In summary, the eigenvalues and basis eigenvectors for this matrix are

1
1
b1 = 0 ,
v
1 = 1,
v1 = 1 ,
1
0

1
2 = 2,
v2 = 1 .
1

(8.18)

In general, given an eigenvalue , the corresponding eigenspace V R n is the subspace spanned by all its eigenvectors. Equivalently, the eigenspace is the kernel
V = ker(A I ).

(8.19)

In particular, is an eigenvalue if and only if V 6= {0} is a nontrivial subspace. Every


nonzero element of V is an eigenvector with eigenvalue . The most economical way to
indicate each eigenspace is by writing out a basis, as in (8.18).

1 2 1
Example 8.8. The characteristic equation of A = 1 1 1 is
2 0 1
0 = det(A I ) = 3 + 2 + 5 + 3 = ( + 1)2 ( 3).

3/7/03

274

c 2003

Peter J. Olver

Again, there is a double eigenvalue 1 = 1 and a simple eigenvalue 2 = 3. However, in


this case the matrix

2 2 1
A 1 I = A + I = 1 0 1
2 0 2

has only a one-dimensional kernel or eigenspace, even though 1 is a double eigenvalue.


The list of eigenvalues and eigenvectors is, in a sense, incomplete:


2
2
1 = 1,
v1 = 1 ,
2 = 3,
v2 = 1 .
2
2

Example 8.9. Finally, consider the matrix A = \begin{pmatrix} 1 & 2 & 0 \\ 0 & 1 & -2 \\ 2 & 2 & -1 \end{pmatrix}. The characteristic
equation is

0 = det(A - λ I) = -λ³ + λ² - 3λ - 5 = -(λ + 1)(λ² - 2λ + 5).

The linear factor yields the eigenvalue -1. The quadratic factor leads to two complex
roots, 1 + 2i and 1 - 2i, which can be obtained via the quadratic formula. Hence A has
one real and two complex eigenvalues:

λ_1 = -1,        λ_2 = 1 + 2i,        λ_3 = 1 - 2i.

Complex eigenvalues are as important as real eigenvalues, and we need to be able to handle
them too. To find the corresponding eigenvectors, which will also be complex, we need to
solve the usual eigenvalue equation (8.16). For example, the eigenvector(s) for λ_2 = 1 + 2i
are found by solving the complex homogeneous linear system

(A - (1 + 2i) I) v = \begin{pmatrix} -2i & 2 & 0 \\ 0 & -2i & -2 \\ 2 & 2 & -2 - 2i \end{pmatrix} \begin{pmatrix} x \\ y \\ z \end{pmatrix} = \begin{pmatrix} 0 \\ 0 \\ 0 \end{pmatrix}.

This homogeneous linear system can be solved by Gaussian elimination (with complex
pivots). A simpler approach is to work directly: the first equation -2i x + 2 y = 0 tells us
that y = i x, while the second equation -2i y - 2 z = 0 says z = -i y = x. If we trust our
calculations so far, we do not need to solve the final equation 2 x + 2 y + (-2 - 2i) z = 0,
since we know that the coefficient matrix is singular and hence it must be a consequence
of the first two equations. (However, it does serve as a useful check on our work.) So,
the general solution v = ( x, i x, x )^T is an arbitrary constant multiple of the complex
eigenvector v_2 = ( 1, i, 1 )^T.
Summarizing, the matrix under consideration has three complex eigenvalues and three
corresponding eigenvectors, each unique up to (complex) scalar multiple:
λ_1 = -1,  v_1 = \begin{pmatrix} -1 \\ 1 \\ 1 \end{pmatrix};        λ_2 = 1 + 2i,  v_2 = \begin{pmatrix} 1 \\ i \\ 1 \end{pmatrix};        λ_3 = 1 - 2i,  v_3 = \begin{pmatrix} 1 \\ -i \\ 1 \end{pmatrix}.

Note that the third complex eigenvalue is the complex conjugate of the second, and the
eigenvectors are similarly related. This is a general fact for real matrices:
Proposition 8.10. If A is a real matrix with a complex eigenvalue λ = μ + i ν and
corresponding complex eigenvector v = x + i y, then the complex conjugate \bar{λ} = μ - i ν is
also an eigenvalue with complex conjugate eigenvector \bar{v} = x - i y.
Proof: First take complex conjugates of the eigenvalue equation (8.15):

\overline{A v} = \bar{A} \bar{v} = \overline{λ v} = \bar{λ} \bar{v}.

Using the fact that a real matrix is unaffected by conjugation, \bar{A} = A, we conclude

A \bar{v} = \bar{λ} \bar{v},        (8.20)

which is the eigenvalue equation for the eigenvalue \bar{λ} and eigenvector \bar{v}.

Q.E.D.

As a consequence, when dealing with real matrices, one only needs to compute the
eigenvectors for one of each complex conjugate pair of eigenvalues. This observation effectively halves the amount of work in the unfortunate event that we are confronted with
complex eigenvalues.
Basic Facts about Eigenvalues
If A is an n × n matrix, then its characteristic polynomial is

p_A(λ) = det(A - λ I) = c_n λ^n + c_{n-1} λ^{n-1} + · · · + c_1 λ + c_0.        (8.21)

The fact that p_A(λ) is a polynomial of degree n is a consequence of the general determinantal formula (1.87). Indeed, every term in (1.87) is plus or minus a product of
entries of the matrix, one from each row and one from each column. The term obtained
from multiplying all the diagonal entries together is

(a_{11} - λ)(a_{22} - λ) · · · (a_{nn} - λ) = (-1)^n λ^n + (-1)^{n-1} ( a_{11} + a_{22} + · · · + a_{nn} ) λ^{n-1} + · · · ,        (8.22)

and corresponds to the identity permutation term in (1.87). All other terms in (1.87) have
at most n - 2 diagonal factors a_{ii} - λ, and so are polynomials of degree ≤ n - 2 in λ,
proving that p_A(λ) is a polynomial. Moreover, (8.22) is the only summand containing the
highest two powers, λ^n and λ^{n-1}, and hence their respective coefficients in p_A(λ) are

c_n = (-1)^n,        c_{n-1} = (-1)^{n-1} ( a_{11} + a_{22} + · · · + a_{nn} ) = (-1)^{n-1} tr A,        (8.23)

where tr A, called the trace of A, is the sum of its diagonal entries. The other coefficients
c_{n-2}, . . . , c_1, c_0 are more complicated combinations of the entries of A. However, if we
set λ = 0 in the characteristic polynomial equation (8.21), we find p_A(0) = det A = c_0,
and hence the constant term equals the determinant of the matrix. In particular, if
A = \begin{pmatrix} a & b \\ c & d \end{pmatrix} is a 2 × 2 matrix, the characteristic polynomial has the form

det(A - λ I) = det \begin{pmatrix} a - λ & b \\ c & d - λ \end{pmatrix} = λ² - (a + d) λ + (a d - b c) = λ² - (tr A) λ + (det A).        (8.24)

Therefore, the characteristic equation (8.17) of an n × n matrix A is a polynomial
equation of degree n, namely p_A(λ) = 0. According to the Fundamental Theorem of
Algebra (see Corollary 15.63 in Chapter 15), every (complex) polynomial of degree n can
be completely factored over the complex numbers:

p_A(λ) = (-1)^n (λ - λ_1)(λ - λ_2) · · · (λ - λ_n).        (8.25)

The complex numbers λ_1, . . . , λ_n, some of which may be repeated, are the roots of the
characteristic equation p_A(λ) = 0, and hence the eigenvalues of the matrix A. Therefore,
we immediately conclude:
Theorem 8.11. An n × n matrix A has at least one and at most n complex eigenvalues.
Most n × n matrices, meaning those for which the characteristic polynomial factors
into n distinct factors, have exactly n complex eigenvalues. More generally, an eigenvalue
λ_j is said to have multiplicity m if the factor (λ - λ_j) appears exactly m times in the
factorization (8.25) of the characteristic polynomial. An eigenvalue is simple if it has
multiplicity 1. In particular, A has n distinct eigenvalues if and only if all its eigenvalues
are simple.
An example of a matrix with only one eigenvalue, of multiplicity n, is the n × n identity
matrix I, whose only eigenvalue is λ = 1. In this case, every nonzero vector in R^n is an
eigenvector of the identity matrix, and so the eigenspace is all of R^n. At the other extreme,
for the bidiagonal Jordan block matrix

J_λ = \begin{pmatrix} λ & & & \\ 1 & λ & & \\ & \ddots & \ddots & \\ & & 1 & λ \end{pmatrix},        (8.26)

the only eigenvalue is λ, again of multiplicity n. Now there is only one eigenvector (up to
scalar multiple), which is the standard basis vector e_n.
Remark: If λ is a complex eigenvalue of multiplicity k for the real matrix A, then its
complex conjugate \bar{λ} also has multiplicity k. This is because complex conjugate roots of a
real polynomial necessarily appear with identical multiplicities, [ALA].
Remark: If n ≤ 4, then one can, in fact, write down an explicit formula for the
solution to a polynomial equation of degree n, and hence explicit (but not particularly
helpful) formulae for the eigenvalues of 2 × 2, 3 × 3 and 4 × 4 matrices. As soon as n ≥ 5,
there is no explicit formula (at least in terms of radicals), and so one must usually resort
to numerical approximations. This remarkable and deep algebraic result was proved by
the young Norwegian mathematician Niels Henrik Abel in the early part of the nineteenth
century.

If we explicitly multiply out the factored product (8.25) and equate the result to the
characteristic polynomial (8.21), we find that its coefficients c_0, c_1, . . . , c_{n-1} can be written
as certain polynomials of the roots, known as the elementary symmetric polynomials; see
Exercise . The first and last are of particular importance:

c_{n-1} = (-1)^{n-1} (λ_1 + λ_2 + · · · + λ_n),        c_0 = λ_1 λ_2 · · · λ_n.        (8.27)

Comparing these formulae with our previous formulae for the coefficients c_0 and c_{n-1}
proves the following useful result.
Proposition 8.12. The sum of the eigenvalues of a matrix equals its trace:

λ_1 + λ_2 + · · · + λ_n = tr A = a_{11} + a_{22} + · · · + a_{nn}.        (8.28)

The product of the eigenvalues equals its determinant:

λ_1 λ_2 · · · λ_n = det A.        (8.29)

Remark : For repeated eigenvalues, one must add or multiply them in the formulae
(8.28), (8.29) according to their multiplicity.
Example 8.13. The matrix

A = \begin{pmatrix} 1 & 2 & 1 \\ 1 & -1 & 1 \\ 2 & 0 & 1 \end{pmatrix}

considered in Example 8.8 has trace and determinant

tr A = 1,        det A = 3.

These fix, respectively, the coefficient of λ² and the constant term in the characteristic
equation. The eigenvalues are -1 and 3, with -1 being a double eigenvalue. For this
particular matrix, formulae (8.28), (8.29) become

tr A = (-1) + (-1) + 3,        det A = (-1)(-1) · 3.
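Proposition 8.12 is easy to verify numerically. The sketch below is an illustration, not from the text; the matrix entries are as reconstructed above for Examples 8.8 and 8.13.

```python
import numpy as np

# Matrix from Example 8.13 (same as Example 8.8, entries as reconstructed above).
A = np.array([[1.0,  2.0, 1.0],
              [1.0, -1.0, 1.0],
              [2.0,  0.0, 1.0]])

eigenvalues = np.linalg.eigvals(A)          # expected: -1 (twice) and 3
print(np.sort(eigenvalues.real))

# Proposition 8.12: the eigenvalues sum to the trace and multiply to the determinant.
print(np.isclose(eigenvalues.sum().real, np.trace(A)))          # True
print(np.isclose(np.prod(eigenvalues).real, np.linalg.det(A)))  # True
```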

8.3. Eigenvector Bases and Diagonalization.


Most of the vector space bases that play a distinguished role in applications consist
of eigenvectors of a particular matrix. In this section, we show that the eigenvectors for
any complete matrix automatically form a basis for R n or, in the complex case, C n . In
the following subsection, we use the eigenvector basis to rewrite the linear transformation
determined by the matrix in a simple diagonal form.
The first task is to show that eigenvectors corresponding to distinct eigenvalues are
necessarily linearly independent.
Lemma 8.14. If λ_1, . . . , λ_k are distinct eigenvalues of the same matrix A, then the
corresponding eigenvectors v_1, . . . , v_k are linearly independent.

Proof: We use induction on the number of eigenvalues. The case k = 1 is immediate,
since an eigenvector cannot be zero. Assume that we know the result for k - 1 eigenvalues.
Suppose we have a linear combination

c_1 v_1 + · · · + c_k v_k = 0        (8.30)

which vanishes. Let us multiply this equation by the matrix A:

A ( c_1 v_1 + · · · + c_k v_k ) = c_1 A v_1 + · · · + c_k A v_k = c_1 λ_1 v_1 + · · · + c_k λ_k v_k = 0.

On the other hand, if we just multiply the original equation by λ_k, we also have

c_1 λ_k v_1 + · · · + c_k λ_k v_k = 0.

Subtracting this from the previous equation, the final terms cancel and we are left with
the equation

c_1 (λ_1 - λ_k) v_1 + · · · + c_{k-1} (λ_{k-1} - λ_k) v_{k-1} = 0.

This is a vanishing linear combination of the first k - 1 eigenvectors, and so, by our
induction hypothesis, can only happen if all the coefficients are zero:

c_1 (λ_1 - λ_k) = 0,    . . . ,    c_{k-1} (λ_{k-1} - λ_k) = 0.

The eigenvalues were assumed to be distinct, so λ_j ≠ λ_k when j ≠ k; consequently,
c_1 = · · · = c_{k-1} = 0. Substituting these values back into (8.30), we find c_k v_k = 0, and
so c_k = 0 also, since the eigenvector v_k ≠ 0. Thus we have proved that (8.30) holds if
and only if c_1 = · · · = c_k = 0, which implies the linear independence of the eigenvectors
v_1, . . . , v_k. This completes the induction step.        Q.E.D.
The most important consequence of this result is stated in the following theorem.
Theorem 8.15. If the n × n real matrix A has n distinct real eigenvalues λ_1, . . . , λ_n,
then the corresponding real eigenvectors v_1, . . . , v_n form a basis for R^n. If A (which may
now be either a real or a complex matrix) has n distinct complex eigenvalues, then its
eigenvectors form a basis for C^n.
If a matrix has multiple eigenvalues, then there may or may not be an eigenvector
basis of R n (or C n ). The matrix in Example 8.7 has an eigenvector basis, whereas the
matrix in Example 8.8 does not. In general, it can be proved, [la], that the dimension of
the eigenspace is less than or equal to the multiplicity of the eigenvalue. In particular, a
simple eigenvalue has a one-dimensional eigenspace, and hence, up to scalar multiple, only
one associated eigenvector.
Definition 8.16. An eigenvalue λ of a matrix A is called complete if its eigenspace
V_λ = ker(A - λ I) has the same dimension as its multiplicity. The matrix A is complete if
all its eigenvalues are.
Remark: The multiplicity of an eigenvalue λ_i is sometimes referred to as its algebraic
multiplicity. The dimension of the eigenspace V_λ is called the geometric multiplicity, and
so completeness requires that the two multiplicities are equal.

Note that a simple eigenvalue is automatically complete, and so only multiple eigenvalues can cause the incompleteness of a matrix.
Theorem 8.17. An n × n real or complex matrix A is complete if and only if its
complex eigenvectors span C^n. In particular, any n × n matrix that has n distinct complex
eigenvalues is complete.
An n × n matrix is incomplete if it does not have n linearly independent complex
eigenvectors. Most matrices, including those with all simple eigenvalues, are complete.
Incomplete matrices are more tricky to deal with, and we relegate most of the messy
details to Appendix D.
Remark: We already noted that complex eigenvectors of a real matrix always appear
in conjugate pairs, v = x ± i y. It can be shown that the real and imaginary parts of these
vectors will form a real basis for R^n. (See Exercise  for the underlying principle.) For
instance, in Example 8.9, the complex eigenvectors are given as ( 1, 0, 1 )^T ± i ( 0, 1, 0 )^T,
and the vectors

\begin{pmatrix} -1 \\ 1 \\ 1 \end{pmatrix},    \begin{pmatrix} 1 \\ 0 \\ 1 \end{pmatrix},    \begin{pmatrix} 0 \\ 1 \\ 0 \end{pmatrix},

consisting of the real eigenvector and the real and imaginary parts of the complex eigenvectors, provide a basis for R^3.
Diagonalization
Every n × n matrix A represents a linear transformation L: R^n → R^n, namely the
function given by matrix multiplication L[ u ] = A u. As we learned in Section 7.4, the
matrix representing a linear transformation depends upon the basis of R^n it is represented
in. Some bases give a particularly simple matrix representation.
For example, the linear transformation

L \begin{pmatrix} x \\ y \end{pmatrix} = \begin{pmatrix} x - y \\ 2x + 4y \end{pmatrix}

studied in Example 7.58 is given by multiplication by the matrix A = \begin{pmatrix} 1 & -1 \\ 2 & 4 \end{pmatrix} when expressed in terms of the
standard basis of R^2. In terms of the alternative basis v_1 = \begin{pmatrix} 1 \\ -1 \end{pmatrix}, v_2 = \begin{pmatrix} 1 \\ -2 \end{pmatrix}, the
linear transformation was represented by the diagonal matrix \begin{pmatrix} 2 & 0 \\ 0 & 3 \end{pmatrix}. This followed from
the action of the linear transformation on the new basis: A v_1 = 2 v_1 and A v_2 = 3 v_2. Now
we can understand the reason behind this simplification. The new basis consists of the two
eigenvectors of the matrix A. This observation is indicative of a general fact: representing
a linear transformation in terms of an eigenvector basis has the effect of replacing its
matrix representative by a simple diagonal form. The effect is to diagonalize the original
coefficient matrix.
According to (7.66), if v_1, . . . , v_n form a basis of R^n, then the matrix representative
in the new basis is given by B = S^{-1} A S, where S = ( v_1 v_2 . . . v_n ) is the matrix

whose columns are the basis vectors. In the preceding example, S = \begin{pmatrix} 1 & 1 \\ -1 & -2 \end{pmatrix}, and we
find that S^{-1} A S = \begin{pmatrix} 2 & 0 \\ 0 & 3 \end{pmatrix} is a diagonal matrix. This serves to motivate the general
definition.
Definition 8.18. A square matrix A is called diagonalizable if there exists a nonsingular matrix S and a diagonal matrix Λ = diag(λ_1, . . . , λ_n) such that

S^{-1} A S = Λ.        (8.31)

To understand the diagonalization equation (8.31), we rewrite it in the equivalent
form

A S = S Λ.        (8.32)

Using the basic property (1.11) of matrix multiplication, one easily sees that the k-th column
of this n × n matrix equation is given by

A v_k = λ_k v_k.

Therefore, the columns of S are eigenvectors, with the entries of the diagonal matrix Λ
being the corresponding eigenvalues! In order to be diagonalizable, the matrix A must
have n linearly independent eigenvectors, i.e., an eigenvector basis, to form the columns of
the diagonalizing matrix S. Since the diagonal form Λ contains the eigenvalues along its
diagonal, it is uniquely determined up to a permutation of its entries.
Now, as we know, not every matrix has an eigenvector basis. Moreover, even when
it exists, the eigenvector basis may be complex, in which case the entries of the diagonal
matrix would be the complex eigenvalues. Thus, we must distinguish between matrices
that are diagonalizable over the complex numbers and the more restrictive class of matrices
which can be diagonalized by a real matrix S.
We have now proved the following important result.
Theorem 8.19. A matrix is complex diagonalizable if and only if it is complete. A
matrix is real diagonalizable if and only if it is complete and has all real eigenvalues.
Remark : Many authors exclusively use the term diagonalizable for what we have
called complete matrices.

Example 8.20. The 3 × 3 matrix A = \begin{pmatrix} 0 & -1 & -1 \\ 1 & 2 & 1 \\ 1 & 1 & 2 \end{pmatrix} considered in Example 8.7
has eigenvector basis

v_1 = \begin{pmatrix} -1 \\ 1 \\ 0 \end{pmatrix},    v_2 = \begin{pmatrix} -1 \\ 0 \\ 1 \end{pmatrix},    v_3 = \begin{pmatrix} -1 \\ 1 \\ 1 \end{pmatrix}.

We assemble these to form the eigenvector matrix

S = \begin{pmatrix} -1 & -1 & -1 \\ 1 & 0 & 1 \\ 0 & 1 & 1 \end{pmatrix},    and so    S^{-1} = \begin{pmatrix} -1 & 0 & -1 \\ -1 & -1 & 0 \\ 1 & 1 & 1 \end{pmatrix}.

The diagonalization equation (8.31) becomes

S^{-1} A S = \begin{pmatrix} -1 & 0 & -1 \\ -1 & -1 & 0 \\ 1 & 1 & 1 \end{pmatrix} \begin{pmatrix} 0 & -1 & -1 \\ 1 & 2 & 1 \\ 1 & 1 & 2 \end{pmatrix} \begin{pmatrix} -1 & -1 & -1 \\ 1 & 0 & 1 \\ 0 & 1 & 1 \end{pmatrix} = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 2 \end{pmatrix} = Λ,

with the eigenvalues of A appearing on the diagonal of Λ, in the same order as the eigenvectors.

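The diagonalization of Example 8.20 can be checked numerically. The sketch below is an illustration, not from the text; the entries of A and S are as reconstructed above, and the result should be diag(1, 1, 2).

```python
import numpy as np

# Matrix and eigenvector matrix from Example 8.20 (entries as reconstructed above).
A = np.array([[0.0, -1.0, -1.0],
              [1.0,  2.0,  1.0],
              [1.0,  1.0,  2.0]])
S = np.array([[-1.0, -1.0, -1.0],
              [ 1.0,  0.0,  1.0],
              [ 0.0,  1.0,  1.0]])

# Diagonalization equation (8.31): S^{-1} A S should be the diagonal matrix Lambda.
Lambda = np.linalg.inv(S) @ A @ S
print(np.round(Lambda, 10))   # expected: diag(1, 1, 2)
```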
A diagonal matrix represents a linear transformation that simultaneously stretches
in the direction of the basis vectors. Thus, when expressed in an eigenvector basis, every
complete matrix represents an elementary combination of (complex) stretching transformations.
If a matrix is not complete, then it cannot be diagonalized, and the best that can
be done is to place it into the so-called Jordan canonical form. A precise statement of
this result can be found in Appendix D. Incomplete matrices represent genuine shearing
transformations. A simple example is a matrix of the form \begin{pmatrix} 1 & c \\ 0 & 1 \end{pmatrix} for c ≠ 0, which
represents a shear in the direction of the x axis.

8.4. Eigenvalues of Symmetric Matrices.


Fortunately, the matrices that arise in most applications possess some additional structure that helps to ameliorate the calculation of their eigenvalues and eigenvectors. The
most prevalent are those complete matrices that have only real eigenvalues and hence a
real eigenvector basis. The most important class of matrices with this property are the
symmetric matrices. In fact, not only are the eigenvalues of a symmetric matrix necessarily
real, the eigenvectors always form an orthogonal basis. In such situations, we can tap into
the dramatic simplification and power of orthogonal bases developed in Chapter 5. In fact,
this is by far the most common way for orthogonal bases to appear: as the eigenvector
bases of symmetric matrices.
Theorem 8.21. If A = A^T is a real symmetric n × n matrix, then
(a) All the eigenvalues of A are real.
(b) Eigenvectors corresponding to distinct eigenvalues are orthogonal.
(c) There is an orthonormal basis of R n consisting of n eigenvectors of A.

If the diagonal entry is negative, the stretch becomes a reflection and stretch. In the complex
case, we have some sort of complex stretching transformation. See Section 7.4 for details.


Remark : Orthogonality is with respect to the standard dot product on R n . As we


noted in Section 7.3, the transpose operation is intimately connected with the dot product.
Introducing a more general inner product on R^n leads to the concept of a self-adjoint linear
transformation, and an analogous result holds in this more general context; see Exercise .

Example 8.22. The 2 × 2 matrix A = \begin{pmatrix} 3 & 1 \\ 1 & 3 \end{pmatrix} considered in Example 8.6 is symmetric, and so has real eigenvalues. The reader can check that the two eigenvectors
v_1 = ( 1, 1 )^T, v_2 = ( -1, 1 )^T are orthogonal: v_1 · v_2 = 0. An orthonormal basis for
R^2 is given by the unit eigenvectors

u_1 = \begin{pmatrix} 1/√2 \\ 1/√2 \end{pmatrix},        u_2 = \begin{pmatrix} -1/√2 \\ 1/√2 \end{pmatrix},        (8.33)

obtained by dividing each eigenvector by its length: u_k = v_k / ‖ v_k ‖.


Proof of Theorem 8.21: First note that if A = A^T is real and symmetric, then

(A v) · w = v · (A w)    for all    v, w ∈ C^n,        (8.34)

where we use the Euclidean dot product for real vectors and, more generally, the Hermitian
dot product when they are complex. (See Exercise .)
To prove (a), suppose λ is a complex eigenvalue with complex eigenvector v ∈ C^n.
Consider the Hermitian dot product of the complex vectors A v and v: by equation (3.80),
we find

(A v) · v = (λ v) · v = λ ‖ v ‖².

On the other hand, by (8.34), since A^T = A is a real matrix, (8.20) implies

(A v) · v = v · (A v) = v · (λ v) = \bar{λ} ‖ v ‖²,

where the complex conjugate appears owing to the sesqui-linearity property (3.82) of the
Hermitian dot product. Equating these two expressions, we deduce

λ ‖ v ‖² = \bar{λ} ‖ v ‖².

Since v is an eigenvector, it is nonzero, v ≠ 0, and so we must have λ = \bar{λ}. This proves
that the eigenvalue λ is real.
To prove (b), suppose

A v = λ v,        A w = μ w,

where λ ≠ μ are distinct eigenvalues. Then, again by (8.34),

λ v · w = (A v) · w = v · (A w) = v · (μ w) = μ v · w,

and hence

(λ - μ) v · w = 0.

Since λ ≠ μ, this implies that v · w = 0 and hence the eigenvectors v, w are orthogonal.
Finally, the proof of (c) is easy if all the eigenvalues of A are distinct. Theorem 8.15
implies that the eigenvectors form a basis of R n , and part (b) proves they are orthogonal.
(An alternative proof starts with orthogonality, and uses Proposition 5.4 to prove that
the eigenvectors form a basis.) To obtain an orthonormal basis, we merely divide the
eigenvectors by their lengths: u_k = v_k / ‖ v_k ‖, as in Lemma 5.2. A general proof can be
found in [113].
Q.E.D.

Example 8.23. Consider the symmetric matrix A = \begin{pmatrix} 5 & -4 & 2 \\ -4 & 5 & 2 \\ 2 & 2 & -1 \end{pmatrix}. A straightforward computation produces its eigenvalues and eigenvectors:

λ_1 = 9,  v_1 = \begin{pmatrix} 1 \\ -1 \\ 0 \end{pmatrix};        λ_2 = 3,  v_2 = \begin{pmatrix} 1 \\ 1 \\ 1 \end{pmatrix};        λ_3 = -3,  v_3 = \begin{pmatrix} 1 \\ 1 \\ -2 \end{pmatrix}.

As the reader can check, the eigenvectors form an orthogonal basis of R^3. The orthonormal
eigenvector basis promised by Theorem 8.21 is obtained by dividing each eigenvector by
its norm:

u_1 = \begin{pmatrix} 1/√2 \\ -1/√2 \\ 0 \end{pmatrix},        u_2 = \begin{pmatrix} 1/√3 \\ 1/√3 \\ 1/√3 \end{pmatrix},        u_3 = \begin{pmatrix} 1/√6 \\ 1/√6 \\ -2/√6 \end{pmatrix}.
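For symmetric matrices, numerical libraries provide routines that return real eigenvalues and an orthonormal eigenvector basis directly, in line with Theorem 8.21. The sketch below is an illustration, not from the text; the matrix entries are as reconstructed above for Example 8.23.

```python
import numpy as np

# Symmetric matrix from Example 8.23 (entries as reconstructed above).
A = np.array([[ 5.0, -4.0,  2.0],
              [-4.0,  5.0,  2.0],
              [ 2.0,  2.0, -1.0]])

# eigh is specialized to symmetric (Hermitian) matrices: it returns real
# eigenvalues in ascending order and an orthonormal set of eigenvectors.
eigenvalues, U = np.linalg.eigh(A)
print(eigenvalues)                      # expected: -3, 3, 9
print(np.allclose(U.T @ U, np.eye(3)))  # True: the columns are orthonormal
```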

Finally, we can characterize positive definite matrices by their eigenvalues.


Theorem 8.24. A symmetric matrix K = K^T is positive definite if and only if all
of its eigenvalues are strictly positive.
Proof: First, if K > 0, then, by definition, x^T K x > 0 for all nonzero vectors x ∈ R^n.
In particular, if x = v is an eigenvector with eigenvalue λ, then

0 < v^T K v = v^T (λ v) = λ ‖ v ‖²,        (8.35)

which immediately proves that λ > 0. Conversely, suppose K has all positive eigenvalues.
Let u_1, . . . , u_n be the orthonormal eigenvector basis of R^n guaranteed by
Theorem 8.21, with K u_j = λ_j u_j. Then, writing

x = c_1 u_1 + · · · + c_n u_n,    we have    K x = c_1 λ_1 u_1 + · · · + c_n λ_n u_n.

Therefore, using the orthonormality of the basis vectors,

x^T K x = ( c_1 u_1 + · · · + c_n u_n ) · ( c_1 λ_1 u_1 + · · · + c_n λ_n u_n ) = λ_1 c_1² + · · · + λ_n c_n² > 0,

which is strictly positive since not all the coefficients c_1, . . . , c_n can be zero. We conclude
that K is positive definite.        Q.E.D.

Remark: The same proof shows that A is positive semi-definite if and only if all its
eigenvalues satisfy λ ≥ 0.
The Spectral Theorem
The Spectral Theorem concerns the diagonalization of real, symmetric matrices. Recall first that an n × n matrix Q is called orthogonal if and only if its columns form
an orthonormal basis of R^n. Alternatively, one characterizes orthogonal matrices by the
condition Q^{-1} = Q^T, as per Definition 5.18.
Theorem 8.25. If A is a real, symmetric matrix, then there exists an orthogonal
matrix Q such that

A = Q Λ Q^{-1} = Q Λ Q^T,        (8.36)

where Λ is a real diagonal matrix, with the eigenvalues of A on the diagonal.
Proof : The proof is an immediate consequence of the diagonalization Theorem 8.19
coupled with Theorem 8.21. One merely replaces the general eigenvector matrix S by the
orthogonal matrix Q whose columns consist of our orthonormal eigenvector basis. Q.E.D.
Remark : The term spectrum of a matrix refers to its eigenvalues. This terminology
is motivated by physics. The spectral energy lines of atoms, molecules and nuclei are
characterized as the eigenvalues of the governing quantum mechanical linear operators!
Warning: The spectral diagonalization A = Q Λ Q^T and the Gaussian diagonalization
A = L D L^T of a regular symmetric matrix are completely different. In particular, the
eigenvalues are not the pivots: Λ ≠ D.
Example 8.26. For the 2 × 2 matrix

A = \begin{pmatrix} 3 & 1 \\ 1 & 3 \end{pmatrix}

considered in Example 8.22, the orthonormal eigenvectors (8.33) produce the diagonalizing
orthogonal rotation matrix

Q = \begin{pmatrix} 1/√2 & -1/√2 \\ 1/√2 & 1/√2 \end{pmatrix}.

The reader can check the spectral factorization

\begin{pmatrix} 3 & 1 \\ 1 & 3 \end{pmatrix} = A = Q Λ Q^T = \begin{pmatrix} 1/√2 & -1/√2 \\ 1/√2 & 1/√2 \end{pmatrix} \begin{pmatrix} 4 & 0 \\ 0 & 2 \end{pmatrix} \begin{pmatrix} 1/√2 & 1/√2 \\ -1/√2 & 1/√2 \end{pmatrix}.

The spectral factorization (8.36) of A has thus provided us with an alternative means
of diagonalizing the associated quadratic form, i.e., of completing the square!
The associated quadratic form is

q(x) = 3 x_1² + 2 x_1 x_2 + 3 x_2² = 4 y_1² + 2 y_2²,

where y = Q^T x, i.e.,

y_1 = ( x_1 + x_2 )/√2,        y_2 = ( -x_1 + x_2 )/√2.

Figure 8.3.    Stretching a Circle into an Ellipse.

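The spectral factorization of Example 8.26 can be reproduced numerically; the following sketch is an illustration (not part of the original text) that checks A = Q Λ Q^T for the matrix A = [[3, 1], [1, 3]].

```python
import numpy as np

# Spectral factorization A = Q Lambda Q^T for the symmetric matrix of Example 8.26.
A = np.array([[3.0, 1.0],
              [1.0, 3.0]])

eigenvalues, Q = np.linalg.eigh(A)      # Q is orthogonal; its columns are unit eigenvectors
Lambda = np.diag(eigenvalues)

print(np.allclose(Q @ Lambda @ Q.T, A))   # True: A = Q Lambda Q^T
print(np.allclose(Q.T @ Q, np.eye(2)))    # True: Q^{-1} = Q^T
```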
As a consequence of these results, a linear map L(x) = A x defined by a positive
definite matrix A can be viewed as a combination of stretches along a mutually orthogonal
set of directions. A good way to visualise this is to consider the effect of the map on the
unit (Euclidean) sphere S_1 = { ‖ x ‖ = 1 }. Stretching the sphere along the coordinate
directions will change it into an ellipsoid E. For example, the stretch x̃ = 2 x, ỹ = 3 y
maps the unit circle x² + y² = 1 into the ellipse (1/4) x̃² + (1/9) ỹ² = 1. The semi-axes have
respective lengths 2, 3, corresponding to the two stretching factors.
More generally, under a positive definite linear transformation x̃ = A x, the unit sphere
will be mapped to an ellipsoid with orthogonal axes in the directions of the eigenvectors,
whose semi-axis lengths are the eigenvalues of A. For example, the matrix A = \begin{pmatrix} 3 & 1 \\ 1 & 3 \end{pmatrix}
considered in the preceding example represents the linear transformation

x̃ = 3 x + y,        ỹ = x + 3 y.

The unit circle x² + y² = 1 will be mapped to the ellipse

( (3 x̃ - ỹ)/8 )² + ( (-x̃ + 3 ỹ)/8 )² = (10/64) x̃² - (3/16) x̃ ỹ + (10/64) ỹ² = 1,

whose semi-axes lie in the directions of the eigenvectors u_1 = \begin{pmatrix} 1/√2 \\ 1/√2 \end{pmatrix}, u_2 = \begin{pmatrix} -1/√2 \\ 1/√2 \end{pmatrix},
and are of respective lengths 4, 2, equal to the corresponding eigenvalues. The effect of this
linear transformation is illustrated in Figure 8.3.
Remark: The ellipsoid E can be identified as the unit sphere for the alternative norm
‖ x ‖² = x^T A x based on the positive definite matrix being diagonalized.
Remark : In elasticity, the stress tensor is represented by a positive definite matrix. Its
eigenvalues are known as the principal stretches and its eigenvectors the principal directions
of stretch for the body.

Minimization Principles
As we learned in Chapter 4, the solution to a linear system with positive definite
coefficient matrix can be characterized by a minimization principle. Thus, it should come
as no surprise that eigenvalues of positive definite matrices can also be characterized by
some sort of minimization procedure. The resulting characterization carries over to positive
definite differential operators, and forms the basis of a collection of numerical algorithms
for computing eigenvalues.
First consider the case of a diagonal, positive definite matrix Λ = diag(λ_1, . . . , λ_n).
Positivity implies that all the diagonal entries, which are the same as the eigenvalues, are
strictly positive: λ_i > 0. We assume that the entries appear in order,

0 < λ_1 ≤ λ_2 ≤ · · · ≤ λ_n,        (8.37)

so λ_1 is the smallest eigenvalue, while λ_n is the largest.
The effect of Λ on vectors y ∈ R^n is to multiply their entries by the diagonal eigenvalues: Λ y = ( λ_1 y_1, λ_2 y_2, . . . , λ_n y_n )^T. In other words, the linear transformation represented by the coefficient matrix Λ has the effect of stretching in the i-th coordinate direction by the factor λ_i. In particular, the minimal stretch occurs in the first direction, with
stretch factor λ_1, while the maximal stretch occurs in the last direction, with factor λ_n.
The germ of the minimization (and maximization) principle for characterizing the smallest (and largest) eigenvalue is contained in this geometrical observation. We just need to
quantify it.
Consider the associated quadratic form

q(y) = y^T Λ y = λ_1 y_1² + λ_2 y_2² + · · · + λ_n y_n².        (8.38)

Clearly, the minimal value of q(y) is 0, obtained when y = 0, and so strict minimization of
q(y) is not of much help. Suppose, however, that we try to minimize q(y) when y ranges
over a subset of possible values, namely the unit sphere consisting of all vectors y that have
Euclidean norm ‖ y ‖ = 1. First note that q(e_i) = λ_i, where e_i denotes the i-th standard
basis vector, which does belong to the unit sphere. Moreover, in view of (8.37) and the
positivity of each y_i²,

q(y) = λ_1 y_1² + λ_2 y_2² + · · · + λ_n y_n² ≥ λ_1 y_1² + λ_1 y_2² + · · · + λ_1 y_n² = λ_1 ( y_1² + · · · + y_n² ) = λ_1,

whenever ‖ y ‖² = y_1² + · · · + y_n² = 1. Thus, the minimal value of q(y) over all vectors of
norm 1 is

min { q(y) | ‖ y ‖ = 1 } = λ_1,

the smallest eigenvalue of Λ. By the same reasoning, q(y) also has a maximal value on the
unit sphere,

max { q(y) | ‖ y ‖ = 1 } = λ_n,
which is the largest eigenvalue. Thus, we can represent the two extreme eigenvalues by
minimization/maximization principles, albeit of a slightly different character than was
considered in Chapter 4.

Suppose K > 0 is any positive definite matrix. We use the Spectral Theorem 8.25 to
write

K = Q Λ Q^T,

where Λ = diag(λ_1, . . . , λ_n) is a diagonal matrix containing the (positive) eigenvalues of
K along its diagonal, written in increasing order, while Q is an orthogonal matrix whose
columns are the orthonormal eigenvector basis of K.
We now rewrite the associated quadratic form

q(x) = x^T K x = x^T Q Λ Q^T x = y^T Λ y,        (8.39)

where y = Q^T x = Q^{-1} x are the coordinates of x with respect to the orthonormal eigenvector basis of K, cf. (7.65). According to the preceding discussion, the minimum of
(8.39) over all possible y of norm ‖ y ‖ = 1 is the smallest eigenvalue λ_1 of Λ, which
is the same as the smallest eigenvalue of K. Moreover, since Q is an orthogonal matrix,
‖ x ‖ = ‖ Q y ‖ = ‖ y ‖ = 1, cf. Proposition 7.63 or Exercise . Therefore, minimizing over
all y of unit norm is the same as minimizing over all vectors x of unit norm. Thus, we
have proved the basic minimization and maximization principle for eigenvalues of positive
definite matrices.
Theorem 8.27. If K > 0 is a positive definite matrix, then

min { x^T K x | ‖ x ‖ = 1 } = λ_1        (8.40)

is the smallest eigenvalue of K. Similarly, the largest eigenvalue equals

max { x^T K x | ‖ x ‖ = 1 } = λ_n.        (8.41)

The minimal (maximal) value of the quadratic form is achieved when we set x = u_1
(x = u_n) equal to the unit eigenvector corresponding to the smallest (largest) eigenvalue.
Example 8.28. The problem is to minimize the value of the quadratic form

q(x, y) = 3 x² + 2 x y + 3 y²

on the unit circle x² + y² = 1. This is precisely of the form (8.40). The coefficient matrix is
K = \begin{pmatrix} 3 & 1 \\ 1 & 3 \end{pmatrix}, whose eigenvalues are, from Example 8.6, λ_1 = 2 and λ_2 = 4. According to
Theorem 8.27, the minimal value for the quadratic form on the unit circle is the smallest
eigenvalue, and hence equal to 2. Its maximal value on the unit circle is the largest
eigenvalue, and hence equal to 4. Indeed, if we substitute the unit eigenvectors into q(x, y),
we obtain

q( -1/√2, 1/√2 ) = 2,        q( 1/√2, 1/√2 ) = 4.

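Theorem 8.27 can also be checked by brute force: sampling the quadratic form of Example 8.28 on the unit circle and comparing the observed extremes with the eigenvalues. The sketch below is an illustration, not part of the original text.

```python
import numpy as np

# Quadratic form q(x) = x^T K x from Example 8.28, sampled on the unit circle.
K = np.array([[3.0, 1.0],
              [1.0, 3.0]])

theta = np.linspace(0.0, 2.0 * np.pi, 10001)
circle = np.stack([np.cos(theta), np.sin(theta)])      # unit vectors, one per column
values = np.einsum('ik,ij,jk->k', circle, K, circle)   # q evaluated at each sample point

print(values.min(), values.max())         # approximately 2 and 4
print(np.sort(np.linalg.eigvalsh(K)))     # exactly [2, 4]
```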
Restricting the minimization principle to just the unit vectors ‖ x ‖ = 1 is not always
convenient, so it would help to rephrase the result using general vectors. Recall that if
v ≠ 0 is any nonzero vector, then x = v/‖ v ‖ is a unit vector. Using this form for x in
the quadratic form leads to the following minimax principles:

min { v^T K v / ‖ v ‖² | v ≠ 0 } = λ_1,        max { v^T K v / ‖ v ‖² | v ≠ 0 } = λ_n.        (8.42)

Thus we replace minimization of a quadratic polynomial over the unit sphere by minimization of a rational function over all of R^n \ {0}. For example, the minimum of

( 3 x² + 2 x y + 3 y² ) / ( x² + y² )

over all ( x, y )^T ≠ 0 is equal to 2, the same minimal eigenvalue of the preceding coefficient
matrix.
What about if we are interested in the intermediate eigenvalues? Then we need to be
a little more sophisticated in our minimization or maximization principles. Look first at
the diagonal case. If we restrict the quadratic form (8.38) to vectors y = ( 0, y_2, . . . , y_n )^T
whose first component is zero, we obtain

q(0, y_2, . . . , y_n) = λ_2 y_2² + · · · + λ_n y_n².

If we now minimize over all such y of norm 1, we obtain, by the exact same reasoning, the
second smallest eigenvalue λ_2 as the minimum. We can characterize such vectors in a more
geometrical manner by noting that they are orthogonal to the first standard basis vector,
y · e_1 = 0, which also happens to be the eigenvector of Λ corresponding to the eigenvalue
λ_1. Similarly, if we want to find the j-th smallest eigenvalue λ_j, we minimize q(y) over
all unit vectors y whose first j - 1 components vanish, y_1 = · · · = y_{j-1} = 0, or, stated
geometrically, over all y such that ‖ y ‖ = 1 and y · e_1 = · · · = y · e_{j-1} = 0, i.e., all vectors
orthogonal to the first j - 1 eigenvectors.
A similar reasoning, based on the Spectral Theorem 8.25 and the orthogonality of
eigenvectors of symmetric matrices, leads to the following result. For simplicity, we state
it assuming there are no repeated eigenvalues, but one can straightforwardly modify it to
cover the general case.
Theorem 8.29. Let K > 0 be a positive definite matrix with distinct eigenvalues
0 < λ_1 < λ_2 < · · · < λ_n and corresponding orthogonal eigenvectors v_1, . . . , v_n. Then the
minimal value of the quadratic form x^T K x over all unit vectors which are orthogonal to
the first j - 1 eigenvectors is the j-th eigenvalue:

min { x^T K x | ‖ x ‖ = 1, x · v_1 = · · · = x · v_{j-1} = 0 } = λ_j.        (8.43)

Remark: If you have learned about Lagrange multipliers for constrained minimization
problems, the eigenvalue λ plays the role of the Lagrange multiplier.


8.5. Linear Dynamical Systems.


Now we have accumulated enough experience with eigenvalues to analyze dynamical systems governed by linear, homogeneous ordinary differential equations. Consider a
system

du/dt = A u        (8.44)

consisting of n first order linear ordinary differential equations in n unknowns u(t) =
( u_1(t), . . . , u_n(t) )^T ∈ R^n. The coefficient matrix A, of size n × n, is assumed to be a
constant matrix.
As we saw, the pure exponential function u(t) = e^{λt} v is a (non-zero) solution if and
only if λ is an eigenvalue of A and v is the corresponding eigenvector. If A is complete, then
there are n linearly independent eigenvectors v_1, . . . , v_n, which, along with their associated
eigenvalues λ_1, . . . , λ_n, are used to produce n distinct exponential solutions

u_1(t) = e^{λ_1 t} v_1,    . . . ,    u_n(t) = e^{λ_n t} v_n.        (8.45)

Since the system (8.44) is linear and homogeneous, we may use our general superposition
principle, cf. Theorem 7.21, to obtain additional solutions by taking linear combinations of
known solutions. Therefore, for any choice of constants c1 , . . . , cn , the linear combination
u(t) = c_1 u_1(t) + · · · + c_n u_n(t) = c_1 e^{λ_1 t} v_1 + · · · + c_n e^{λ_n t} v_n        (8.46)

is a solution to the linear system depending on n arbitrary constants.


Are there any other solutions? The answer is no: linear independence implies
that equation (8.46) represents the most general solution to the system. This result is a
consequence of the basic existence and uniqueness theorem for linear systems of ordinary
differential equations, which we discuss next.
Example 8.30. Consider the linear system

du/dt = 3 u + v,        dv/dt = u + 3 v.        (8.47)

We first write the system in matrix form du/dt = A u, with unknown u = \begin{pmatrix} u(t) \\ v(t) \end{pmatrix} and
coefficient matrix A = \begin{pmatrix} 3 & 1 \\ 1 & 3 \end{pmatrix}. According to Example 8.6, the eigenvalues and eigenvectors
of A are

λ_1 = 4,  v_1 = \begin{pmatrix} 1 \\ 1 \end{pmatrix};        λ_2 = 2,  v_2 = \begin{pmatrix} -1 \\ 1 \end{pmatrix}.

We use the eigenvalues and eigenvectors to construct the two particular exponential solutions

u_1(t) = e^{4t} \begin{pmatrix} 1 \\ 1 \end{pmatrix} = \begin{pmatrix} e^{4t} \\ e^{4t} \end{pmatrix},        u_2(t) = e^{2t} \begin{pmatrix} -1 \\ 1 \end{pmatrix} = \begin{pmatrix} -e^{2t} \\ e^{2t} \end{pmatrix}.

According to the preceding remark, to be justified below, the general solution to (8.47) is
then given as a linear combination

u(t) = \begin{pmatrix} u(t) \\ v(t) \end{pmatrix} = c_1 e^{4t} \begin{pmatrix} 1 \\ 1 \end{pmatrix} + c_2 e^{2t} \begin{pmatrix} -1 \\ 1 \end{pmatrix} = \begin{pmatrix} c_1 e^{4t} - c_2 e^{2t} \\ c_1 e^{4t} + c_2 e^{2t} \end{pmatrix},

where c_1, c_2 are arbitrary constants.
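The eigenvector-based solution of Example 8.30 agrees with a direct numerical integration of the system. The sketch below is an illustration only; the constants c1 and c2 are arbitrary sample values, not taken from the text.

```python
import numpy as np
from scipy.integrate import solve_ivp

# System (8.47): du/dt = A u with the matrix of Example 8.30.
A = np.array([[3.0, 1.0],
              [1.0, 3.0]])

# Sample constants (hypothetical values, chosen only for this check).
c1, c2 = 0.5, -1.0
u0 = c1 * np.array([1.0, 1.0]) + c2 * np.array([-1.0, 1.0])   # u(0)

def analytic(t):
    return c1 * np.exp(4 * t) * np.array([1.0, 1.0]) + \
           c2 * np.exp(2 * t) * np.array([-1.0, 1.0])

sol = solve_ivp(lambda t, u: A @ u, (0.0, 1.0), u0, rtol=1e-10, atol=1e-12)
print(np.allclose(sol.y[:, -1], analytic(sol.t[-1]), rtol=1e-6))   # True
```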

Basic Theory of Linear Ordinary Differential Equations


Before proceeding further, it will help to briefly summarize the basic theory of linear
systems of ordinary differential equations. The fundamental existence and uniquness result
for homogeneous linear systems is a consequence of a more general existence and uniqueness
theorem for nonlinear systems of ordinary differential equations, to be is discussed in
Section 19.1. Even though we will only study the constant coefficient case in detail, the
result is equally applicable to homogeneous linear systems with variable coefficients, and
so we allow the coefficient matrix to depend continuously on t.
A system of n first order ordinary differential equations requires n initial conditions
one for each variable in order to specify its solution uniquely. More specifically:
Theorem 8.31. Let A(t) be an n n matrix of continuous functions of t for all
a < t < b. Given an initial time a< t0 < b and an initial vector b R n , the initial value
problem
du
(8.48)
= A(t) u,
u(t0 ) = b,
dt
has a unique solution u(t) defined for all a < t < b.
In particular, for our constant coefficient system (8.44), there is a unique solution u(t)
that is defined for all < t < for any choice of initial conditions. In particular, the
uniqueness of solutions implies that the solution which has zero initial conditions u(t 0 ) = 0
is the trivial zero solution u(t) 0.
The next result tells us how many different solutions we need in order to construct
the general solution to our system by linear superposition.
Theorem 8.32. If u1 (t), . . . , un (t) are n linearly independent solutions to the homogeneous system of n first order linear ordinary differential equation
du
= A(t) u,
dt

(8.49)

u(t) = c1 u1 (t) + + cn un (t),

(8.50)

then the general solution is

for arbitrary constants c1 , . . . , cn .


Proof : We begin by noting that, for solutions of first order linear ordinary differential
equations, we only need to check linear independence at a single point.
Lemma 8.33. The solutions u1 (t), . . . , uk (t) to the first order linear system (8.49)
are linearly independent functions if and only if the vectors u1 (t0 ), . . . , uk (t0 ) obtained by
evaluating the solutions at a single time t0 are linearly independent.
Proof: Given any constants c_1, . . . , c_k, superposition tells us that the linear combination

u(t) = c_1 u_1(t) + · · · + c_k u_k(t)        (8.51)

is a solution to the system. Now, if u1 (t0 ), . . . , uk (t0 ) are linearly dependent at the point
t0 , then we can find c1 , . . . , ck not all zero such that
u(t_0) = c_1 u_1(t_0) + · · · + c_k u_k(t_0) = 0.

By the uniqueness Theorem 8.31, the corresponding solution

u(t) = c_1 u_1(t) + · · · + c_k u_k(t) ≡ 0        (8.52)

must be identically zero since it has zero initial conditions: u(t_0) = 0. This proves that
the solutions u_1(t), . . . , u_k(t) are linearly dependent. On the other hand, if (8.52) holds for
all t, then it holds at any particular t_0, and so linear dependence of the solutions implies
the same of the vectors u_1(t_0), . . . , u_k(t_0).
Q.E.D.
Warning: This result is not true if the functions are not solutions to a first order
system! For example,

u_1(t) = \begin{pmatrix} 1 \\ t \end{pmatrix},        u_2(t) = \begin{pmatrix} cos t \\ sin t \end{pmatrix},

are linearly independent vector-valued functions, but u_1(0) = ( 1, 0 )^T = u_2(0) are linearly
dependent vectors. It is even possible for functions to give linearly dependent vectors at
each individual time, while the functions themselves are linearly independent. A simple example is when u_1(t) = f(t) v and u_2(t) = g(t) v, where v is a constant vector and f(t), g(t)
are any two linearly independent scalar functions. At each t_0, the vectors u_1(t_0), u_2(t_0) are
linearly dependent, while u_1(t), u_2(t) are linearly independent functions. We conclude
that not every collection of functions can satisfy a (nonsingular) linear system.
Returning to the proof of Theorem 8.32, if we have n linearly independent solutions,
then Lemma 8.33 implies that, at the time t_0, the vectors u_1(t_0), . . . , u_n(t_0) are linearly
independent, and hence form a basis for R^n. This means that we can express any initial
condition

u(t_0) = b = c_1 u_1(t_0) + · · · + c_n u_n(t_0)

as a linear combination thereof. Superposition implies that the corresponding solution to
the initial value problem (8.48) is given by the same linear combination

u(t) = c_1 u_1(t) + · · · + c_n u_n(t).

We conclude that every solution to the ordinary differential equation can be written in the
form (8.50).        Q.E.D.
Complete Systems
In the case of a constant coefficient linear system with a complete coefficient matrix,
we have already determined n linearly independent exponential solutions constructed from
the eigenvalues and eigenvectors. Theorem 8.32 tells us that we know the most general
solution.

Theorem 8.34. If the n × n matrix A is complete, then the general (complex)
solution to the constant coefficient linear system du/dt = A u is given by

u(t) = c_1 e^{λ_1 t} v_1 + · · · + c_n e^{λ_n t} v_n,        (8.53)

where v_1, . . . , v_n are the linearly independent eigenvectors and λ_1, . . . , λ_n the corresponding eigenvalues.
Proof: It suffices to verify that the solutions (8.45) are linearly independent. But
since the eigenvectors are linearly independent, the solutions define linearly independent
vectors u_k(0) = v_k at time t = 0. Thus, Lemma 8.33 implies that the functions u_k(t) are,
indeed, linearly independent.        Q.E.D.
Let us apply our formula (8.53) to solve the general initial value problem. Substituting
into the initial conditions, we must have

u(t_0) = c_1 e^{λ_1 t_0} v_1 + · · · + c_n e^{λ_n t_0} v_n = b.

Applying our handy matrix formula (2.15), this takes the form of a linear system

S y = b,    where    y_i = c_i e^{λ_i t_0},

and where S = ( v_1, . . . , v_n ) is the eigenvector matrix constructed earlier. Theorem 8.15
implies that S is invertible, and so we can uniquely solve for y = S^{-1} b and hence for each
coefficient c_i = y_i e^{-λ_i t_0}. Therefore, the unique solution to any initial condition can be
written in the form (8.53) for some choice of the constants c_1, . . . , c_n.
Example 8.35. Consider the initial value problem

du_1/dt = -2 u_1 + u_2,        du_2/dt = 2 u_1 - 3 u_2,        u_1(0) = 3,    u_2(0) = 0.        (8.54)

The system takes the form (8.44), where the coefficient matrix is A = \begin{pmatrix} -2 & 1 \\ 2 & -3 \end{pmatrix}. A
straightforward computation produces the following eigenvalues and eigenvectors of A:

λ_1 = -4,  v_1 = \begin{pmatrix} 1 \\ -2 \end{pmatrix};        λ_2 = -1,  v_2 = \begin{pmatrix} 1 \\ 1 \end{pmatrix}.

Therefore, the general solution to (8.54) is

u(t) = \begin{pmatrix} u_1(t) \\ u_2(t) \end{pmatrix} = c_1 e^{-4t} \begin{pmatrix} 1 \\ -2 \end{pmatrix} + c_2 e^{-t} \begin{pmatrix} 1 \\ 1 \end{pmatrix} = \begin{pmatrix} c_1 e^{-4t} + c_2 e^{-t} \\ -2 c_1 e^{-4t} + c_2 e^{-t} \end{pmatrix},

where c_1, c_2 are arbitrary constants. Substituting this expression into the initial conditions,
we find we need to solve the linear system

c_1 + c_2 = 3,        -2 c_1 + c_2 = 0,

for c_1 = 1, c_2 = 2. Thus, the (unique) solution to the initial value problem is

u_1(t) = e^{-4t} + 2 e^{-t},        u_2(t) = -2 e^{-4t} + 2 e^{-t}.        (8.55)

Note that both components of the solution decay exponentially fast to 0 as t → ∞.
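The closed-form solution (8.55) can be compared against a numerical integration of the initial value problem. The following sketch is an illustration, with A and the initial data as reconstructed above.

```python
import numpy as np
from scipy.integrate import solve_ivp

# Initial value problem (8.54), with A and u(0) as reconstructed above.
A = np.array([[-2.0,  1.0],
              [ 2.0, -3.0]])
u0 = np.array([3.0, 0.0])

sol = solve_ivp(lambda t, u: A @ u, (0.0, 2.0), u0,
                t_eval=np.linspace(0.0, 2.0, 5), rtol=1e-10, atol=1e-12)

# Closed-form solution (8.55) evaluated at the same times.
t = sol.t
u1 = np.exp(-4 * t) + 2 * np.exp(-t)
u2 = -2 * np.exp(-4 * t) + 2 * np.exp(-t)
print(np.allclose(sol.y, np.vstack([u1, u2]), atol=1e-8))   # True
```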



Example 8.36. Consider the linear system

du_1/dt = u_1 + 2 u_2,        du_2/dt = u_2 - 2 u_3,        du_3/dt = 2 u_1 + 2 u_2 - u_3.        (8.56)

The coefficient matrix is

A = \begin{pmatrix} 1 & 2 & 0 \\ 0 & 1 & -2 \\ 2 & 2 & -1 \end{pmatrix}.

In Example 8.9 we computed the eigenvalues and eigenvectors:

λ_1 = -1,  v_1 = \begin{pmatrix} -1 \\ 1 \\ 1 \end{pmatrix};    λ_2 = 1 + 2i,  v_2 = \begin{pmatrix} 1 \\ i \\ 1 \end{pmatrix};    λ_3 = 1 - 2i,  v_3 = \begin{pmatrix} 1 \\ -i \\ 1 \end{pmatrix}.

The first leads to a real solution, but the second and third lead to complex solutions to
our real system of equations; for example

û_2(t) = e^{(1+2i)t} \begin{pmatrix} 1 \\ i \\ 1 \end{pmatrix}.

While this is a perfectly valid complex solution, it is not so convenient to work with if,
as in most applications, we are primarily interested in real-valued solutions to our system.
According to the general reality principle of Theorem 7.38, any complex solution to a
real, homogeneous, linear system can be broken up into its real and imaginary parts, each
of which is a real solution to the linear system. The key is Euler's formula (3.74) for a
complex exponential, where we write

e^{(1+2i)t} = e^t cos 2t + i e^t sin 2t,

and so

û_2(t) = e^{(1+2i)t} \begin{pmatrix} 1 \\ i \\ 1 \end{pmatrix} = \begin{pmatrix} e^t cos 2t \\ -e^t sin 2t \\ e^t cos 2t \end{pmatrix} + i \begin{pmatrix} e^t sin 2t \\ e^t cos 2t \\ e^t sin 2t \end{pmatrix}.

The latter two real vector functions are individually solutions to the system (8.56), as the
reader can readily check. Therefore, we have produced three linearly independent real
solutions to our system:

u_1(t) = \begin{pmatrix} -e^{-t} \\ e^{-t} \\ e^{-t} \end{pmatrix},        u_2(t) = \begin{pmatrix} e^t cos 2t \\ -e^t sin 2t \\ e^t cos 2t \end{pmatrix},        u_3(t) = \begin{pmatrix} e^t sin 2t \\ e^t cos 2t \\ e^t sin 2t \end{pmatrix}.

Theorem 8.32 tells us that the general solution is a linear combination of the 3 independent
solutions:

u(t) = c_1 u_1(t) + c_2 u_2(t) + c_3 u_3(t) = \begin{pmatrix} -c_1 e^{-t} + c_2 e^t cos 2t + c_3 e^t sin 2t \\ c_1 e^{-t} - c_2 e^t sin 2t + c_3 e^t cos 2t \\ c_1 e^{-t} + c_2 e^t cos 2t + c_3 e^t sin 2t \end{pmatrix}.
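One can verify numerically that the real and imaginary parts obtained above are themselves solutions of (8.56). The sketch below is an illustration, not from the text; the matrix entries are as reconstructed above, and the check uses a finite-difference derivative of u_2(t).

```python
import numpy as np

# Coefficient matrix of (8.56), entries as reconstructed above.
A = np.array([[1.0, 2.0,  0.0],
              [0.0, 1.0, -2.0],
              [2.0, 2.0, -1.0]])

# Real solution u2(t) = e^t (cos 2t, -sin 2t, cos 2t): check du/dt = A u numerically.
def u2(t):
    return np.exp(t) * np.array([np.cos(2 * t), -np.sin(2 * t), np.cos(2 * t)])

t, h = 0.7, 1e-6
derivative = (u2(t + h) - u2(t - h)) / (2 * h)          # central finite difference
print(np.allclose(derivative, A @ u2(t), atol=1e-5))    # True
```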

Incidentally, the third complex solution also produces two real solutions, but these
are already dependent upon the ones we have listed. In fact, since λ_3 = \bar{λ}_2 is the complex
conjugate of the eigenvalue λ_2, its eigenvector v_3 = \bar{v}_2 is also the complex conjugate of
the eigenvector v_2, and, finally, the solutions are also related by complex conjugation:

û_3(t) = e^{(1-2i)t} \begin{pmatrix} 1 \\ -i \\ 1 \end{pmatrix} = \begin{pmatrix} e^t cos 2t \\ -e^t sin 2t \\ e^t cos 2t \end{pmatrix} - i \begin{pmatrix} e^t sin 2t \\ e^t cos 2t \\ e^t sin 2t \end{pmatrix} = \overline{û_2(t)}.
Therefore, when using complex eigenvalues to construct solutions to real linear ordinary
differential equations, one only need look at one eigenvalue from each complex conjugate
pair.
The General Case
If the matrix A is not complete, then the formulae for the solutions are a little more
intricate, and require polynomials as well as (complex) exponentials. The full details
appear in Appendix D, but the following general characterization of the solutions in such
situations will be used in the following sections.
Theorem 8.37. The general real solution to any constant coefficient homogeneous

linear system du/dt = A u is a linear combination of n linearly independent solutions of the
following types:
(a) If λ is a complete real eigenvalue of multiplicity m, then there exist m linearly
independent solutions of the form

u(t) = e^{λt} v_k,        k = 1, . . . , m,

where v_1, . . . , v_m are linearly independent eigenvectors for the eigenvalue λ.

(b) If μ ± i ν form a pair of complete complex conjugate eigenvalues of multiplicity m,
then there exist 2m linearly independent real solutions of the forms

u(t) = e^{μt} [ cos(νt) w_k - sin(νt) z_k ],
û(t) = e^{μt} [ sin(νt) w_k + cos(νt) z_k ],        k = 1, . . . , m,

where v_k = w_k ± i z_k are the associated complex conjugate eigenvectors.

(c) If λ is an incomplete real eigenvalue of multiplicity m and r = dim V_λ, then there
exist m linearly independent solutions of the form

u(t) = e^{λt} p_k(t),        k = 1, . . . , m,

where p_k(t) is a vector of polynomials of degree ≤ m - r.

(d) If μ ± i ν form a pair of incomplete complex conjugate eigenvalues of multiplicity
m and r = dim V_λ, then there exist 2m linearly independent real solutions of the
forms

u(t) = e^{μt} [ cos(νt) p_k(t) - sin(νt) q_k(t) ],
û(t) = e^{μt} [ sin(νt) p_k(t) + cos(νt) q_k(t) ],        k = 1, . . . , m,

where p_k(t), q_k(t) are vectors of polynomials of degree ≤ m - r.


Corollary 8.38. Every real solution to a homogeneous linear system of ordinary


differential equations is a vector-valued function whose entries are linear combinations of
functions of the particular form
t^k e^{μt} cos(νt)    and    t^k e^{μt} sin(νt),    for    k = 0, 1, . . . , l,    μ, ν real,        (8.57)

i.e., sums of products of exponentials, trigonometric functions and polynomials. The exponential coefficients are the real parts of the eigenvalues of the coefficient matrix; the
trigonometric frequencies are the imaginary parts of the eigenvalues; nonconstant polynomials only appear if the matrix is incomplete.
Example 8.39. The incomplete cases should remind the reader of the solution to a
single scalar ordinary differential equation in the case of a repeated root to the characteristic equation. For example, to solve the second order equation
d²u/dt² - 2 du/dt + u = 0,        (8.58)

we substitute the exponential ansatz u = e^{λt}, leading to the characteristic equation

λ² - 2λ + 1 = 0.

There is only one double root, λ = 1, and hence, up to scalar multiple, only one exponential
solution u_1(t) = e^t. In the scalar case, the second missing solution is obtained by just
multiplying by t, so that u_2(t) = t e^t. The general solution is

u(t) = c_1 u_1(t) + c_2 u_2(t) = c_1 e^t + c_2 t e^t.

The equivalent phase plane system (8.11) is

du/dt = \begin{pmatrix} 0 & 1 \\ -1 & 2 \end{pmatrix} u.

Note that the coefficient matrix is incomplete: it has λ = 1 as a double eigenvalue,
but only one independent eigenvector, namely v = ( 1, 1 )^T. The two linearly independent
solutions to the system can be constructed from the solutions u_1(t) = e^t, u_2(t) = t e^t to
the original equation, and so

u_1(t) = \begin{pmatrix} e^t \\ e^t \end{pmatrix},        u_2(t) = \begin{pmatrix} t e^t \\ t e^t + e^t \end{pmatrix}.        (8.59)

Note the appearance of the polynomial factor t in the solution formula. The general
solution is obtained as a linear combination of these two basic solutions.
Warning: In (8.59), the second vector solution u2 is not obtained from the first by
merely multiplying by t. Incomplete systems are not that easy to handle!

8.6. Stability of Linear Systems.


Now that we have a good understanding of the formulae for the solutions to first order
linear systems, we are in a good position to investigate their qualitative features. We are
particularly interested in stability properties. We begin with a simple calculus lemma
whose proof is left to the reader.
Lemma 8.40. Let μ, ν be real and k ≥ 0 an integer. A function of the form

f(t) = t^k e^{μt} cos(νt)    or    t^k e^{μt} sin(νt)        (8.60)

will decay to zero for large t, so lim_{t→∞} f(t) = 0, if and only if μ < 0. The function remains
bounded, | f(t) | ≤ C for all t ≥ 0, if and only if either μ < 0, or μ = 0 and k = 0.


In other words, exponential decay will always cancel out polynomial growth, while
trigonometric functions always remain bounded. Now, in the solution to our ordinary
differential equation, the functions (8.60) come from the eigenvalues = + i of the
coefficient matrix. The lemma implies that the asymptotic behavior of the solutions, and
hence their stability, depends on the sign of = Re . If < 0, then the solutions decay
to zero at an exponential rate as t . If > 0, then the solutions become unbounded
as t . in the borderline case = 0, the solutions remain bounded as along as no
positive powers of t appear in the solution.
Asymptotic stability of the zero solution requires that all other solutions decay to 0
as t , and hence all the eigenvalues must satisfy = Re < 0. Or, stated another
way, all eigenvalues must lie in the left half plane the subset of C to the left of the
imaginary axis. Thus, we have demonstrated the basis asymptotic stability criterion for
linear systems.
Theorem 8.41. A first order linear, homogeneous, constant-coefficient system of
ordinary differential equations (8.44) has asymptotically stable zero solution if and only if
all the eigenvalues of the coefficient matrix lie in the left half plane: Re < 0.
Example 8.42. Consider the system
du
= 2 u 6 v + w,
dt

dv
dw
= 3 u 3 v w,
= 3 u v 3 w.
dt
dt

2 6 1
The coefficient matrix A = 3 3 1 is found to have eigenvalues 1 = 2, 2 =
3 1 3

1 + i 6 , 3 = 1 i 6 , with respective real parts 2, 1, 1. Therefore, according


to Theorem 8.41, the zero solution u v w 0 is asymptotically
stable. Indeed,
the
2t
t
t
solutions involve linear combinations of the functions e
, e cos 6 t, and e sin 6 t,
all of which decay to 0 at an exponential rate. The slowest to decay are the latter two, and
so most solutions to the linear system decay in proportion to e t , i.e., at an exponential
rate equal to the least negative real part.
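The stability criterion of Theorem 8.41 reduces to inspecting the real parts of the eigenvalues, which is easy to do numerically. The sketch below is an illustration; the matrix entries are as reconstructed above for Example 8.42.

```python
import numpy as np

# Coefficient matrix of Example 8.42 (signs as reconstructed above).
A = np.array([[2.0, -6.0,  1.0],
              [3.0, -3.0, -1.0],
              [3.0, -1.0, -3.0]])

eigenvalues = np.linalg.eigvals(A)
print(eigenvalues)                      # approximately -2 and -1 +/- sqrt(6) i
print(np.all(eigenvalues.real < 0))     # True: the zero solution is asymptotically stable
```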

A particularly important class of systems consists of the linear gradient flows

du/dt = -K u,        (8.61)

in which K > 0 is a symmetric, positive definite matrix. According to Theorem 8.24,
all the eigenvalues of K are real and positive. Moreover, by Exercise , the eigenvalues of
-K are the negatives of the eigenvalues of K, and are therefore real and negative. We
conclude, via Theorem 8.41, that the zero solution to any gradient flow system (8.61) with
negative definite coefficient matrix -K is asymptotically stable.

Example 8.43. Using the methods of Chapter 3, the matrix K = \begin{pmatrix} 1 & -1 \\ -1 & 5 \end{pmatrix} is found
to be positive definite. The associated gradient flow is

du/dt = -u + v,        dv/dt = u - 5 v.        (8.62)

The eigenvalues and eigenvectors of -K = \begin{pmatrix} -1 & 1 \\ 1 & -5 \end{pmatrix} are

λ_1 = -3 + √5,  v_1 = \begin{pmatrix} 1 \\ -2 + √5 \end{pmatrix};        λ_2 = -3 - √5,  v_2 = \begin{pmatrix} 1 \\ -2 - √5 \end{pmatrix}.

Therefore, the general solution to the system is

u(t) = c_1 e^{(-3+√5)t} v_1 + c_2 e^{(-3-√5)t} v_2,

or, in components,

u(t) = c_1 e^{(-3+√5)t} + c_2 e^{(-3-√5)t},        v(t) = c_1 (-2 + √5) e^{(-3+√5)t} + c_2 (-2 - √5) e^{(-3-√5)t}.

The solutions clearly tend to zero as t → ∞ at an exponential rate prescribed by the least
negative eigenvalue -3 + √5 ≈ -0.7639, confirming the global asymptotic stability of
the zero equilibrium solution.
The reason for the term gradient flow is that the vector

-K u = -∇q(u)

appearing on the right hand side of (8.61) is, in fact, the negative gradient of the quadratic function

q(u) = (1/2) u^T K u = (1/2) \sum_{i,j=1}^{n} k_{ij} u_i u_j.        (8.63)

Thus, we can write (8.61) as

du/dt = -∇q(u).        (8.64)


For the particular system (8.62),

q(u, v) = (1/2) u² - u v + (5/2) v²,

and so the gradient flow is given by

du/dt = -∂q/∂u = -u + v,        dv/dt = -∂q/∂v = u - 5 v.

In applications, the quadratic function (8.63) often represents the energy in the system. Its
negative gradient -∇q points in the direction of steepest decrease of q. (See Chapters A
and B and Section 18.3 for a detailed discussion of this point.) Therefore, the solutions
of (8.64) are decreasing the energy function q(u) as fast as possible. The solutions seek
the minimum of q(u), which is at the equilibrium position u = 0. Thus, in physical
applications, the stability of the gradient flow models the natural behavior of systems
that are minimizing their energy.
Extension of the asymptotic stability criterion of Theorem 8.41 to stable equilibria is
not difficult.
Theorem 8.44. A first order linear, homogeneous, constant-coefficient system of ordinary differential equations (8.44) has a stable zero solution if and only if all the eigenvalues
satisfy Re λ ≤ 0, and, moreover, any eigenvalue lying on the imaginary axis, Re λ = 0, is
complete, meaning that it has as many independent eigenvectors as its multiplicity.
Proof : The proof is the same as above, using Corollary 8.38 and the decay properties
in Lemma 8.40. If a purely imaginary eigenvalue is complete, then the associated solutions
only involve trigonometric functions, and hence remain stable. On the other hand, solutions
for incomplete eigenvalues also contain powers of t, and hence cannot remain bounded as
t → ∞. Thus, the zero solution is not stable when such eigenvalues are incomplete. Details
can be found in Appendix D.
Q.E.D.
Example 8.45. A Hamiltonian system in the plane takes the form

du/dt = ∂H/∂v,        dv/dt = -∂H/∂u,        (8.65)

where H(u, v) is known as the Hamiltonian function for the system. If

H(u, v) = (1/2) a u² + b u v + (1/2) c v²        (8.66)

is a quadratic function, then the Hamiltonian system (8.65) is linear,

du/dt = b u + c v,        dv/dt = -a u - b v,        (8.67)

with coefficient matrix A = \begin{pmatrix} b & c \\ -a & -b \end{pmatrix}. The characteristic equation is

det(A - λ I) = λ² + (a c - b²) = 0.

If H is positive definite, then a c - b² > 0, and so the roots of the characteristic equation
are purely imaginary: λ = ± i √(a c - b²). Since the eigenvalues are simple, the stability
criterion of Theorem 8.44 holds and we conclude that planar Hamiltonian systems with
positive definite Hamiltonian are stable.


8.7. Two-Dimensional Systems.


The two-dimensional case is particularly instructive, since many of the most important
phenomena already manifest themselves. Moreover, the solutions can be easily pictured
by their phase portraits. Setting u(t) = ( u(t), v(t) )^T, the system (8.44) has the explicit
form

du/dt = a u + b v,        dv/dt = c u + d v,        (8.68)

where A = \begin{pmatrix} a & b \\ c & d \end{pmatrix} is the coefficient matrix. As in Section 8.1, we will refer to the (u, v)
plane as the phase plane. In particular, phase plane equivalents (8.11) of second order
scalar equations form a special class of such two-dimensional systems.
According to (8.24), the characteristic equation for the given 2 × 2 matrix is

det(A - λ I) = λ² - τ λ + δ = 0,        (8.69)

where

τ = tr A = a + d,        δ = det A = a d - b c,

are, respectively, the trace and the determinant of A. The nature of the eigenvalues, and
hence the solutions, is therefore almost entirely determined by these two quantities. The
sign of the discriminant

Δ = τ² - 4 δ = (tr A)² - 4 det A = (a - d)² + 4 b c        (8.70)

determines whether the roots or eigenvalues

λ = ( τ ± √Δ ) / 2        (8.71)

are real or complex, and thereby plays a key role in the classification.
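The classification that follows depends only on the trace, determinant, and discriminant. As a small illustration (not part of the original text), the sketch below computes these quantities and reports which of the three eigenvalue types occurs.

```python
import numpy as np

# Classify the eigenvalue type of a 2x2 matrix from its trace, determinant,
# and discriminant, following (8.69)-(8.71).
def eigenvalue_type(A):
    tau = np.trace(A)                 # tau = a + d
    delta = np.linalg.det(A)          # delta = a d - b c
    disc = tau ** 2 - 4 * delta       # discriminant (8.70)
    if disc > 0:
        return "two distinct real eigenvalues"
    elif disc < 0:
        return "complex conjugate pair"
    else:
        return "double real eigenvalue"

print(eigenvalue_type(np.array([[3.0, 1.0], [1.0, 3.0]])))   # distinct real (4 and 2)
print(eigenvalue_type(np.array([[0.0, 1.0], [-1.0, 0.0]])))  # complex conjugate (+/- i)
```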
Let us summarize the different possibilities as classified by their qualitative behavior.
Each situation will be illustrated by a representative phase portrait, which plots the curves
parametrized by the solutions (u(t), v(t)) in the phase plane. A complete taxonomy appears
in Figure 8.3 below.
Distinct Real Eigenvalues
The coefficient matrix A has two real, distinct eigenvalues λ_1 < λ_2 if and only if the
discriminant (8.70) of the quadratic equation (8.69) is positive: Δ > 0. In this case, the
real solutions take the exponential form

u(t) = c_1 e^{λ_1 t} v_1 + c_2 e^{λ_2 t} v_2,        (8.72)

where v_1, v_2 are the eigenvectors and c_1, c_2 are arbitrary constants, to be determined by the
initial conditions. The asymptotic behavior of the solutions is governed by the size of the
eigenvalues. Let V_k = { c v_k }, k = 1, 2, denote the eigenlines, i.e., the one-dimensional
eigenspaces associated with each eigenvalue λ_k.
There are five qualitatively different cases, depending upon the signs of the two eigenvalues. These are listed by their descriptive name, followed by the required conditions on
the trace, determinant and discriminant of the coefficient matrix.

I a. Stable Node:   Δ > 0,   tr A < 0,   det A > 0.
If λ1 < λ2 < 0 are both negative, then 0 is an asymptotically stable node. The solutions all tend to 0 as t → ∞. Since the first exponential e^{λ1 t} decreases much faster than the second e^{λ2 t}, the first term in the solution (8.72) will soon become negligible, and hence u(t) ≈ c2 e^{λ2 t} v2 when t is large. Therefore, all solutions with c2 ≠ 0 will arrive at the origin along a curve that is tangent to the eigenline V2. But the solutions with c2 = 0 come in to the origin directly along the eigenline V1, and at a faster rate. Conversely, as t → −∞, the solutions all go off to ∞. In this case, the first exponential grows faster than the second, and so the solutions u(t) ≈ c1 e^{λ1 t} v1 for t ≪ 0. Thus, in their escape to ∞, the solution trajectories become more and more parallel to the eigenline V1, except for those with c1 = 0 that go out along the eigenline V2.
I b. Saddle Point:   Δ > 0,   det A < 0.
If λ1 < 0 < λ2, then 0 is an unstable saddle point. Solutions (8.72) with c2 = 0 start out on the eigenline V1 and go in to 0 as t → ∞, while solutions with c1 = 0 start on V2 and go to 0 as t → −∞. All other solutions go off to ∞ at both large positive and large negative times. As t → +∞, the solutions approach the eigenline V2, while as t → −∞, they asymptote to V1.
For a saddle point, the eigenline V1 is called the stable manifold, indicating that solutions that start on it eventually go to the equilibrium point 0, while V2 is the unstable manifold, meaning that solutions on it go to equilibrium as t → −∞.

I c. Unstable Node:   Δ > 0,   tr A > 0,   det A > 0.
If 0 < λ1 < λ2 are both positive, then 0 is an unstable node. The phase portrait is the same as that of a stable node, but the solution trajectories are traversed in the opposite direction. In other words, reversing time by replacing t by − t will convert an unstable node into a stable node and vice versa; see Exercise . Thus, in the unstable case, the solutions all tend to 0 as t → −∞ and off to ∞ as t → ∞. Except for the solutions on the eigenlines, they asymptote to V1 as t → −∞, and become parallel to V2 as t → ∞.
I d. Stable Line:   Δ > 0,   tr A < 0,   det A = 0.
If λ1 < λ2 = 0, then every point on the eigenline V2 associated with the zero eigenvalue is an equilibrium point. Every other solution moves along a straight line parallel to V1 and tends to a single point on V2 as t → ∞.
I e. Unstable Line:   Δ > 0,   tr A > 0,   det A = 0.
This is merely the time reversal of a stable line. If 0 = λ1 < λ2, then every point on the eigenline V1 is an equilibrium. Every other solution moves off to ∞ along a straight line parallel to V2 as t → ∞, and tends to a single point on V1 as t → −∞.
Complex Conjugate Eigenvalues
The coefficient matrix A has two complex conjugate eigenvalues
λ = μ ± i ν,   where   μ = ½ τ = ½ tr A,   ν = ½ √(− Δ),

if and only if its discriminant is negative: Δ < 0. In this case, the real solutions can be written in the phase-amplitude form (2.7):
u(t) = r e^{μ t} [ cos(ν t − δ) w + sin(ν t − δ) z ],   (8.73)
where v = w ± i z are the complex eigenvectors. As noted above, the two vectors w, z are always linearly independent. There are three subcases, depending upon the sign of the real part μ, or, equivalently, the sign of the trace of A.
II a. Stable Focus:   Δ < 0,   tr A < 0.
If μ < 0, then 0 is an asymptotically stable focus. As t → ∞, the solutions all spiral in to 0 with frequency ν, meaning that it takes time 2π/ν for the solution to go once around the origin. As t → −∞, the solutions spiral off to ∞ with the same frequency.
II b. Center :   Δ < 0,   tr A = 0.
If μ = 0, then 0 is a center . The solutions move periodically around elliptical orbits, with frequency ν and period 2π/ν. In particular, solutions that start out near 0 stay nearby, and hence a center is a stable, but not asymptotically stable, equilibrium.
II c. Unstable Focus:   Δ < 0,   tr A > 0.
If μ > 0, then 0 is an unstable focus. The phase portrait is the time reversal, t → − t, of a stable focus, with solutions spiralling off to ∞ as t → ∞ and in to the origin as t → −∞, again with a common frequency ν.
Incomplete Double Real Eigenvalue
The matrix will have a double real eigenvalue λ = ½ τ = ½ tr A if and only if the discriminant Δ = 0. The formula for the solutions depends on whether the eigenvalue is complete or not. If the eigenvalue has only one independent eigenvector v, then the solutions are no longer given by just exponentials. The general formula is
u(t) = (c1 + c2 t) e^{λ t} v + c2 e^{λ t} w,   (8.74)
where (A − λ I) w = v. The vector w is known as a generalized eigenvector of A and is not parallel to v; see ezidre2 for details. We let V = { c v } denote the eigenline associated with the genuine eigenvector v.
III a. Stable Improper Node:   Δ = 0,   tr A < 0,   A ≠ λ I.
If λ < 0, then 0 is an asymptotically stable improper node. Since t e^{λ t} is larger than e^{λ t} for t > 1, the solutions u(t) ≈ c2 t e^{λ t} v tend to 0 as t → ∞ along a curve that is tangent to the eigenline V. Similarly, as t → −∞, the solutions go off to ∞; again the same term dominates. The solutions become more and more parallel to V, but moving away in the opposite direction from their approach.

III b. Linear Motion:   Δ = 0,   tr A = 0,   A ≠ λ I.
If λ = 0, then, as in case I d, every point on the eigenline V is an equilibrium point. Every other solution moves along a straight line parallel to V, going off to ∞ in either direction. The origin is an unstable equilibrium point.

III c. Unstable Improper Node:   Δ = 0,   tr A > 0,   A ≠ λ I.
If λ > 0, then 0 is an unstable improper node. The phase portrait is the time reversal of the stable improper node.
Complete Double Real Eigenvalue
In this case, every vector in R 2 is an eigenvector, and so the real solutions take the form u(t) = e^{λ t} v, where v is an arbitrary constant vector. In fact, this case occurs if and only if A = λ I is a multiple of the identity matrix.
IV a. Stable Star :   A = λ I,   λ < 0.
If λ < 0, then 0 is a globally asymptotically stable star. The solutions all move along rays emanating from the origin, and go to 0 as t → ∞.
IV b. Trivial :   A = O.
If λ = 0, then the only possibility is A = O. Now every solution is constant and every
point is a (stable) equilibrium point. Nothing happens! This is the only case not pictured
in Figure 8.4.
IV c. Unstable Star :   A = λ I,   λ > 0.
If λ > 0, then 0 is unstable. The phase portrait is the time reversal of the stable star, and so the solutions move along rays, and tend to 0 as t → −∞.
Figure 8.5 plots where the different possibilities lie, as prescribed by the trace and
determinant of the coefficient matrix. The horizontal axis indicates the value of τ = tr A, while the vertical axis refers to δ = det A. Points on the parabola τ² = 4 δ are the cases with vanishing discriminant Δ = 0, and correspond to either stars or improper nodes, except for the origin, which is either linear motion or trivial. All the asymptotically stable
cases lie in the shaded upper left quadrant where tr A < 0 and det A > 0. The borderline
points on the two axes are either stable centers, when tr A = 0, det A > 0, or stable lines,
when tr A < 0, det A = 0, or the origin, which may or may not be stable depending upon
whether A is the zero matrix or not. All other values for the trace and determinant result
in unstable equilibria.
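The following short Python function is an illustrative sketch (not from the text) that classifies a 2 × 2 system by its trace, determinant, and discriminant, following the case labels used above; the tolerance and the sample matrix are arbitrary choices, and degenerate borderline cases are only handled roughly.

    import numpy as np

    def classify_planar_system(A, tol=1e-12):
        """Rough classification of du/dt = A u for a real 2x2 matrix A."""
        tau, delta = np.trace(A), np.linalg.det(A)
        disc = tau**2 - 4 * delta                     # discriminant (8.70)
        if disc > tol:                                # real, distinct eigenvalues
            if delta < -tol:
                return "saddle point"
            if delta > tol:
                return "stable node" if tau < 0 else "unstable node"
            return "stable line" if tau < 0 else "unstable line"
        if disc < -tol:                               # complex conjugate eigenvalues
            if abs(tau) <= tol:
                return "center"
            return "stable focus" if tau < 0 else "unstable focus"
        return "star or improper node (discriminant zero)"

    print(classify_planar_system(np.array([[0.0, 1.0], [-1.0, -0.2]])))  # stable focus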
Remark : Time reversal t → − t changes the coefficient matrix A → − A, and hence τ → − τ, while the determinant δ = det A = det(− A) is unchanged. Thus, the effect is to reflect Figure 8.5 through the vertical axis, interchanging the stable regions and their unstable counterparts, while saddle points remain saddle points.
Remark : In physical applications, the coefficient matrix A is usually not known exactly, and so the physical system may be a slight perturbation of the mathematical model.
Thus, it is important to know which systems are structurally stable, meaning the basic
qualitative features are preserved under sufficiently small changes in the system.
Now, a small perturbation will alter the entries of the coefficient matrix slightly,
and hence move the trace and determinant by a comparably small amount. The net
effect is to slightly perturb its eigenvalues. Therefore, the question of structural stability

[Figure 8.4. Phase Portraits: Ia. Stable Node; Ib. Saddle Point; Ic. Unstable Node; Id. Stable Line; Ie. Unstable Line; IIa. Stable Focus; IIb. Center; IIc. Unstable Focus; IIIa. Stable Improper Node; IIIb. Linear Motion; IIIc. Unstable Improper Node; IVa. Stable Star; IVc. Unstable Star.]

[Figure 8.5. Stability Regions for Two-Dimensional Linear Systems: the trace-determinant plane, with regions labeled stable spiral, unstable spiral, center, stable node, unstable node, stable line, unstable line, and saddle point.]

reduces to whether the eigenvalues have moved sufficiently far to send the system into a
different stability regime. Asymptotically stable systems remain stable under small enough
perturbations, since the property that the eigenvalues have negative real parts is preserved
under small perturbation. For a similar reason, unstable systems remain unstable under
small perturbations. On the other hand, a borderline stable system (either a center or the trivial system) could become either asymptotically stable or unstable under an adverse perturbation.
Structural stability requires more, since the overall phase portrait should not significantly change. A system in any of the open regions in the stability Figure 8.5, e.g., a stable spiral, unstable node, saddle point, etc., is structurally stable, whereas a system that lies on the parabola τ² = 4 δ, or the horizontal axis, or the positive vertical axis, e.g., an improper node, a stable line, etc., is not, since a small perturbation could send it into either of the adjoining regions. In other words, structural stability requires that the eigenvalues be distinct, and have non-zero real part Re λ ≠ 0. This final result also applies to systems in
higher dimensions, [65].

8.8. Dynamics of Structures and Circuits.


In Chapter 6, we discussed the equations describing the equilibrium configurations of
mass-spring chains and, more generally, structures made out of elastic bars. We are now
in a position to analyze the dynamical motions of such structures. Consider first a linear
mass/spring chain consisting of n masses m1 , . . . , mn connected together and, possibly,
to the top and bottom supports by springs. Let ui(t) denote the displacement from equilibrium of the ith mass, and ej(t) the elongation of the jth spring. (As in Section 6.1, the masses are only allowed to move in the direction of the chain; that is, we only consider one-dimensional motion.) Since we are now interested in dynamics, both of these are allowed to depend on time.
The motion of each mass is governed by Newton's Second Law,
Force = Mass × Acceleration.   (8.75)
The acceleration of the ith mass is the second derivative üi = d²ui/dt² of its displacement. The right hand sides of Newton's Law are thus mi üi, which we collect together in vector form M ü by multiplying the second derivative of the displacement vector u(t) by the diagonal, positive definite mass matrix M = diag(m1, . . . , mn). The masses of the springs are assumed to be negligible in this approximation.
If, to begin with, we assume no external forcing of the mass/spring system and no
frictional effects, then the only force exerted on each mass is due to the elongations of its
two connecting springs, which is measured by the components of the internal force vector
F = − K u = − A^T C A u,   (8.76)
where A denotes the (reduced) incidence matrix, C the diagonal matrix of spring constants, and K = A^T C A the stiffness matrix for the chain, cf. (6.9).
Substituting the force formula (8.76) into Newton's Law (8.75) produces the fundamental dynamical equations
M d²u/dt² = − K u = − A^T C A u   (8.77)
governing the free, frictionless motions of the system. The goal is to understand the solutions of this system of second order ordinary differential equations, and then, rather straightforwardly, generalize the methods to cover both electrical circuits as well as structures in two and three dimensions, all of which are governed by the same basic second order system (8.77) based on the appropriate reduced incidence matrix A and constitutive matrix C.

Example 8.46. The simplest case is that of a single mass connected to a fixed
support by a spring. The dynamical system (8.77) reduces to a scalar equation
m d²u/dt² + k u = 0.   (8.78)
Here m > 0 is the mass, while k > 0 is the spring's stiffness. The general solution to this simple homogeneous, second order linear ordinary differential equation is
u(t) = c1 cos ω t + c2 sin ω t = r cos( ω t − δ ),   where   ω = √(k/m)   (8.79)
is the natural frequency of vibration. We have used the phase-amplitude equation (2.7)
to rewrite the solution as a single cosine with a phase lag δ. The motion is periodic, with period P = 2π/ω. The frequency formula tells us that the stiffer the spring or the lighter the
mass, the faster the vibrations. Take note of the square root; for instance, quadrupling
the mass slows down the vibrations by a factor of two.
The constants c1, c2 (or their phase-amplitude counterparts r, δ) are determined by the initial conditions. Physically, we need to specify both an initial position and an initial velocity in order to uniquely prescribe the subsequent motion of the system:
u(t0) = a,   u̇(t0) = b.   (8.80)

The resulting solution is most conveniently written in the form
u(t) = a cos ω(t − t0) + (b/ω) sin ω(t − t0) = r cos( ω(t − t0) − δ ),   (8.81)
which has amplitude and phase given by
r = √( a² + b²/ω² ),   δ = tan⁻¹( b/(a ω) ).   (8.82)

A typical solution is plotted in Figure ampphasev .
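The amplitude and phase formulas (8.81)-(8.82) are easy to evaluate numerically; the following sketch (the mass, stiffness, and initial data are arbitrary sample values) checks that the two forms of the solution agree.

    import numpy as np

    m, k = 1.0, 4.0                  # sample mass and spring stiffness
    omega = np.sqrt(k / m)           # natural frequency (8.79)
    a, b = 0.5, 1.0                  # initial position u(t0) = a and velocity u'(t0) = b

    r = np.sqrt(a**2 + (b / omega)**2)          # amplitude, cf. (8.82)
    delta = np.arctan2(b / omega, a)            # phase lag, cf. (8.82)

    t0, t = 0.0, 1.3
    u = r * np.cos(omega * (t - t0) - delta)    # phase-amplitude form (8.81)
    u_check = a * np.cos(omega * (t - t0)) + (b / omega) * np.sin(omega * (t - t0))
    print(u, u_check)                           # the two expressions agree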


Just as exponentials form the basic building blocks for the solution of systems of first
order ordinary differential equations, trigonometric functions form the basic building blocks
for solutions to undamped mechanical (and electrical) vibrations governed by second order
systems. For simplicity, let us first assume that all the masses are all the same and equal
to 1 (in some appropriate units), so that (8.77) reduces to
d2 u
= K u.
dt2

(8.83)

Mimicking our success in the first order case, let us try substituting a trigonometric ansatz
u(t) = (cos t) v,

(8.84)

with v 6= 0 a constant vector, into the system (8.83). Differentiating (8.84) directly, we
find
d2 u
du
= (sin t) v,
= 2 (cos t) v.
dt
dt2
Therefore, our ansatz (8.84) will solve (8.83) if and only if
K v = 2 v,
which means that v is an eigenvector of K with eigenvalue
= 2 .

(8.85)

Now, there is nothing special about the cosine function the same computation also
applies to the sine function, and tells us that u(t) = (sin t) v is also a solution whenever
v is an eigenvector with eigenvalue = 2 . Summarizing:

[Figure 8.6. Quasi-Periodic and Periodic Functions: graphs of cos t + cos √5 t and cos t + cos (7/3) t.]

Lemma 8.47. If v is an eigenvector of the matrix K with eigenvalue λ = ω², then the trigonometric vector functions u(t) = (cos ω t) v and ũ(t) = (sin ω t) v are solutions to the second order system ü = − K u.


Remark : Alternatively, one can utilize the complex exponential solutions e^{i ω t} v and e^{− i ω t} v, which are related to the trigonometric solutions via Euler's formula (3.74). This is common practice in electrical circuit analysis, although electrical engineers tend to use j instead of i to denote the square root of −1.
Stable Structures
Let us next consider the motion of a stable structure, of the type introduced in Section 6.3. As with the chains, the masses are assumed to be concentrated at the nodes, and the bars have negligible mass. According to Theorem 6.8, stability requires that the reduced stiffness matrix be positive definite: K > 0. Theorem 8.25 says that all the eigenvalues of K are strictly positive, λi > 0, which is good, since it implies that the eigenvalue/frequency relation (8.85) gives real frequencies ωi = √λi. Moreover, all positive definite matrices are complete, and so, even when there are fewer than n different eigenvalues, there always exists a complete system of n linearly independent real eigenvectors that form an orthogonal basis for R n.
Since (8.83) is a second order system of homogeneous linear equations in n unknowns, we require 2 n linearly independent solutions. Lemma 8.47 produces 2 independent solutions for each positive eigenvalue, and hence there are indeed 2 n linearly independent solutions,
ui(t) = (cos √λi t) vi = (cos ωi t) vi,     ũi(t) = (sin √λi t) vi = (sin ωi t) vi,     i = 1, . . . , n,   (8.86)
governed by the n mutually orthogonal (or even orthonormal) eigenvectors v1, . . . , vn of K. The general solution to (8.83) is an arbitrary linear combination,
u(t) = Σ_{i=1}^{n} ( ci cos ωi t + di sin ωi t ) vi = Σ_{i=1}^{n} ri cos( ωi t − δi ) vi,   (8.87)
of these 2 n basic solutions. The 2 n coefficients ci, di (or their phase-amplitude counterparts ri > 0 and 0 ≤ δi < 2π) are uniquely determined by the initial conditions. As in (8.80), we need to specify both the initial positions and initial velocities of all the masses; this requires a total of 2 n initial conditions
u(t0) = a,   u̇(t0) = b.   (8.88)
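As an illustration of formulas (8.86)-(8.88), the following hedged sketch builds the normal-mode solution of ü = − K u for a given positive definite stiffness matrix K and initial data a, b using NumPy's symmetric eigensolver; the sample matrix and vectors are arbitrary.

    import numpy as np

    K = np.array([[2.0, -1.0],
                  [-1.0, 2.0]])              # sample positive definite stiffness matrix
    a = np.array([1.0, 0.0])                 # initial displacements u(0)
    b = np.array([0.0, 1.0])                 # initial velocities  u'(0)

    lam, V = np.linalg.eigh(K)               # eigenvalues lam[i], orthonormal eigenvectors V[:, i]
    omega = np.sqrt(lam)                     # normal frequencies (8.85)
    c = V.T @ a                              # cosine coefficients, from u(0) = sum c_i v_i
    d = (V.T @ b) / omega                    # sine coefficients, from u'(0) = sum d_i omega_i v_i

    def u(t):
        """General solution (8.87) evaluated at time t."""
        return V @ (c * np.cos(omega * t) + d * np.sin(omega * t))

    print(u(0.0), a)       # matches the initial displacement
    print(u(2.5))          # quasi-periodic motion of the two masses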

[Figure 8.7. Motion of a Double Mass/Spring Chain with Fixed Supports.]

The individual solutions (8.86) are known as the normal modes of vibration of our system, and the ωi = √λi the normal frequencies, which are the square roots of the eigenvalues of the stiffness matrix. Each summand is a simply periodic, vector-valued function of period Pi = 2π/ωi. A linear combination of several such periodic functions is, in general, called quasi-periodic. Unless the frequencies are all integer multiples of a common frequency, such a quasi-periodic solution will never precisely repeat itself, and so can appear to be somewhat chaotic, even though it is built up from a few very simple periodic functions. The reader will find it very instructive to graph a simple quasi-periodic function, say
f(t) = c1 cos t + c2 cos √5 t
for various values of c1, c2. Comparison with a case where the frequencies are all rational, say
f(t) = c1 cos t + c2 cos (7/3) t,
is also instructive. The former is truly quasi-periodic, while the latter is, in fact, periodic with period 6π. See Figure 8.6 for two sample graphs.
Example 8.48. Consider the case of two equal unit masses connected in a row to
two supports by three springs, as in the illustration in Figure 8.7. If the spring constants
are c1 , c2 , c3 (from top to bottom), then the stiffness matrix is

c1 0 0

1
0
1 1 0
c
+
c

c
1
2
2
K=
0 c2 0 1 1 =
0 1 1
c2
c2 + c 3
0 0 c3
0 1
The eigenvalues and eigenvectors of K will prescribe the normal modes of vibration and
natural frequencies of our twomass chain.
Let us look in detail at the case of identical springs, and choose our units so that
c1 = c2 = c3 = 1. Then K = [ 2  −1 ; −1  2 ] has eigenvalues and eigenvectors
λ1 = 1,  v1 = ( 1, 1 )^T,     λ2 = 3,  v2 = ( 1, −1 )^T.
The general solution to the system is then
u(t) = r1 cos( t − δ1 ) ( 1, 1 )^T + r2 cos( √3 t − δ2 ) ( 1, −1 )^T.
The first summand is the normal mode vibrating at the relatively slow frequency ω1 = 1, with the two masses moving in tandem. The second summand is the normal mode that vibrates faster, with frequency ω2 = √3, in which the two masses move in opposing directions. The general motion is a combination of these two normal modes. Since the two frequencies are irrational multiples of each other, the motion is quasi-periodic. The system never quite returns to its initial configuration, unless it happens to be vibrating in only one of the normal modes. A typical solution is plotted in Figure 8.7.
If we eliminate the bottom spring, so the masses are just hanging from the top support as in Figure 8.7, then the reduced incidence matrix A = [ 1  0 ; −1  1 ] loses its last row and becomes square and statically determinate. Assuming that the springs have unit stiffnesses c1 = c2 = 1, the corresponding stiffness matrix is
K = A^T A = [ 1  −1 ; 0  1 ] [ 1  0 ; −1  1 ] = [ 2  −1 ; −1  1 ].
The eigenvalues and eigenvectors are
λ1 = (3 − √5)/2,  v1 = ( 1, (1 + √5)/2 )^T,     λ2 = (3 + √5)/2,  v2 = ( 1, (1 − √5)/2 )^T.
The general solution to the system is then
u(t) = r1 cos( √((3 − √5)/2) t − δ1 ) ( 1, (1 + √5)/2 )^T + r2 cos( √((3 + √5)/2) t − δ2 ) ( 1, (1 − √5)/2 )^T.
The slower normal mode, with frequency ω1 = √((3 − √5)/2), has the masses moving in tandem, with the bottom mass moving proportionally (1 + √5)/2 farther. The faster normal mode, with frequency ω2 = √((3 + √5)/2), has the masses moving in opposite directions, with the top mass experiencing the larger displacement. Moreover, both modes vibrate slower than when there is a bottom support. A typical graph of the displacements of the masses is plotted in Figure 8.8.
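One can confirm these eigenvalues, and the claim that both free-end modes are slower, numerically; the following sketch (illustrative only) compares the natural frequencies of the two chains.

    import numpy as np

    K_fixed = np.array([[2.0, -1.0],
                        [-1.0, 2.0]])    # both ends attached: eigenvalues 1 and 3
    K_free = np.array([[2.0, -1.0],
                       [-1.0, 1.0]])     # bottom spring removed: eigenvalues (3 -+ sqrt(5))/2

    freq_fixed = np.sqrt(np.linalg.eigvalsh(K_fixed))
    freq_free = np.sqrt(np.linalg.eigvalsh(K_free))

    print(freq_fixed)                        # approximately [1.0, 1.7321]
    print(freq_free)                         # approximately [0.6180, 1.6180]
    print(np.all(freq_free < freq_fixed))    # True: both modes are slower without the bottom support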
Example 8.49. Consider a three mass/spring chain, with unit springs and masses,
and both ends attached to fixed supports. The stiffness matrix

K = [ 2  −1  0 ; −1  2  −1 ; 0  −1  2 ]

[Figure 8.8. Motion of a Double Mass/Spring Chain with One Free End.]

has eigenvalues and eigenvectors

λ1 = 2 − √2,  v1 = ( 1, √2, 1 )^T,     λ2 = 2,  v2 = ( 1, 0, −1 )^T,     λ3 = 2 + √2,  v3 = ( 1, −√2, 1 )^T.
The three normal modes, from slowest to fastest, have frequencies
(a) ω1 = √(2 − √2):  all three masses move in tandem.
(b) ω2 = √2:  the two outer masses move in opposing directions, while the middle mass does not move.
(c) ω3 = √(2 + √2):  the two outer masses move in tandem, while the inner mass moves in the opposite direction.
The general motion is a quasi-periodic combination of these three normal modes. As such,
to the naked eye it can look very complicated. Our mathematical analysis unmasks the
innate simplicity, where the complex dynamics are, in fact, entirely governed by just three
fundamental modes of vibration.
Unstable Structures
So far, we have just dealt with the stable case, when the reduced incidence matrix
has trivial kernel, ker A = {0}, and so the stiffness matrix K = AT CA is positive definite.
Unstable configurations, which can admit rigid motions and/or mechanisms, will provide
additional complications. The simplest version is a single mass that is not attached to
any spring. The mass experiences no restraining force, and has motion governed by the
elementary second order ordinary differential equation
d²u/dt² = 0.
The general solution
u(t) = c t + d
has the mass either sitting still at a specified displacement or moving in a straight line
with constant velocity c 6= 0.
More generally, suppose K = A^T C A is only positive semi-definite. Each vector 0 ≠ v ∈ ker A = ker K represents a mode of instability of the system. Since K v = 0, we can interpret v as a null eigenvector of K, with eigenvalue λ = 0. Lemma 8.47 gives us two solutions to the dynamical equations (8.83) with associated frequency ω = √λ = 0. The first, u(t) = (cos ω t) v = v, is a constant solution, i.e., an equilibrium configuration of the system. Thus, an unstable system does not have a unique equilibrium configuration; every vector v ∈ ker A gives a constant solution to the system. On the other hand, the second solution, u(t) = (sin ω t) v = 0, is trivial, and of no help for constructing the general solution. We still need a second independent solution. In analogy with the scalar case, we try the solution ansatz u(t) = t v, which works, since
du/dt = v,   d²u/dt² = 0 = − K u = − t K v.
Therefore, to each element of the kernel of the stiffness matrix (i.e., each rigid motion and mechanism) there is a two-dimensional family of solutions
u(t) = (c t + d) v.
When c = 0, we have a constant equilibrium solution; when c ≠ 0, the solution moves with constant velocity in the null direction v, representing an unstable mode of the system. The general solution will be a linear superposition of the vibrational modes corresponding to the positive eigenvalues and these unstable linear motions corresponding to the zero eigenvalues.
Remark : If the null direction v represents a rigid translation, then the entire structure
will move in that direction. If v represents an infinitesimal rotation, then, owing to our
linear approximation to the true nonlinear bar elongations, the individual masses will
move in straight lines, which are tangents approximating the circular motion that occurs
in the true physical, nonlinear regime; see our earlier discussion in Chapter 6. Finally, if
we excite a mechanism, then the masses will again follow straight lines, but in different
directions, whereas in the full nonlinear physical system the masses may move along much
more complicated curved trajectories.
Example 8.50. Consider a system of three unit masses connected in a line by two
unit springs, but not attached to any fixed supports; see Figure 8.9. This structure could be viewed as a simplified model of a triatomic molecule that is only allowed to move in the vertical direction. The incidence matrix is A = [ 1  −1  0 ; 0  1  −1 ] and, since we are dealing with unit springs, the stiffness matrix is
K = A^T A = [ 1  0 ; −1  1 ; 0  −1 ] [ 1  −1  0 ; 0  1  −1 ] = [ 1  −1  0 ; −1  2  −1 ; 0  −1  1 ].

[Figure 8.9. A Triatomic Molecule.]

The eigenvalues and eigenvectors of K are easily found:


1 = 0,

1
v1 = 1 ,
1

2 = 1,

1
v 2 = 0 ,
1

3 = 3 ,

1
v3 = 2 .
1

Each positive eigenvalue gives two trigonometric solutions, while the zero eigenvalue leads to solutions that depend linearly on t. This yields the six linearly independent solutions,
u1(t) = ( 1, 1, 1 )^T,        u3(t) = cos t ( 1, 0, −1 )^T,       u5(t) = cos √3 t ( 1, −2, 1 )^T,
u2(t) = t ( 1, 1, 1 )^T,      u4(t) = sin t ( 1, 0, −1 )^T,       u6(t) = sin √3 t ( 1, −2, 1 )^T.

The first solution u1(t) is a constant, equilibrium mode, where the masses rest at a fixed common distance from their reference positions. The second, u2(t), is the unstable mode, corresponding to a uniform translational motion of the masses without any stretch of the interconnecting springs. The final four solutions represent vibrational modes. In the first pair, u3(t), u4(t), the two outer masses move in opposing directions, while the middle mass remains fixed; in the final pair, u5(t), u6(t), the two outer masses move in tandem, while the inner mass moves in the opposite direction. The general solution is a linear combination of the six normal modes,
u(t) = c1 u1(t) + · · · + c6 u6(t),   (8.89)
and corresponds to the molecule moving at a fixed speed while the individual masses perform a quasi-periodic vibration.
Let us see if we can predict the motion of the molecule from its initial conditions
u(0) = a,   u̇(0) = α,
where a = ( a, b, c )^T is the vector of initial displacements of the three atoms, while α = ( α, β, γ )^T is the vector of their initial velocities. Substituting the solution formula (8.89) leads to the linear systems
c1 v1 + c3 v2 + c5 v3 = a,     c2 v1 + c4 v2 + c6 v3 = α,
for the coefficients c1, . . . , c6. Since the eigenvectors of the symmetric matrix K are mutually orthogonal, we can use our orthogonality formula to immediately compute the coefficients:
c1 = a · v1 / ‖ v1 ‖² = (a + b + c)/3,     c3 = a · v2 / ‖ v2 ‖² = (a − c)/2,     c5 = a · v3 / ‖ v3 ‖² = (a − 2 b + c)/6,
c2 = α · v1 / ‖ v1 ‖² = (α + β + γ)/3,     c4 = α · v2 / ‖ v2 ‖² = (α − γ)/2,     c6 = α · v3 / ‖ v3 ‖² = (α − 2 β + γ)/6.

In particular, the unstable translational mode is excited if and only if its coefficient c2 ≠ 0, and this occurs if and only if there is a nonzero net initial velocity of the molecule, α + β + γ ≠ 0. In this case the molecule will move off to ∞ at a uniform velocity c = c2 = ⅓ (α + β + γ), equal to the average of the individual initial velocities. On the other hand, if α + β + γ = 0, then the unstable mode will not be excited and the molecule will quasi-periodically vibrate, with frequencies 1 and √3, while sitting at a fixed location.
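The orthogonality computation above is easy to reproduce numerically; here is a hedged sketch (sample initial data chosen arbitrarily) that recovers the coefficients c1, . . . , c6 by projecting onto the eigenvectors.

    import numpy as np

    v1 = np.array([1.0, 1.0, 1.0])       # null eigenvector (translation mode)
    v2 = np.array([1.0, 0.0, -1.0])      # eigenvalue 1
    v3 = np.array([1.0, -2.0, 1.0])      # eigenvalue 3

    a = np.array([0.3, -0.1, 0.4])       # sample initial displacements (a, b, c)
    alpha = np.array([0.2, -0.5, 0.3])   # sample initial velocities (alpha, beta, gamma)

    def project(x, v):
        """Coefficient of x along the eigenvector v, using orthogonality."""
        return np.dot(x, v) / np.dot(v, v)

    c1, c3, c5 = (project(a, v) for v in (v1, v2, v3))
    c2, c4, c6 = (project(alpha, v) for v in (v1, v2, v3))

    print(c2, np.mean(alpha))                # c2 equals the average initial velocity of the atoms
    print(c1 * v1 + c3 * v2 + c5 * v3, a)    # the projections reassemble the initial displacement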
The observations established in this example hold, in fact, in complete generality. Let
us state the result, leaving the details of the proof as an exercise for the reader.
Theorem 8.51. The solution to an unstable second order linear system with positive semi-definite coefficient matrix K = A^T C A is a combination of a quasi-periodic vibration and a uniform motion at a fixed velocity in the direction of a null eigenvector v ∈ ker A. In particular, the system does not experience any unstable motion, and so will just vibrate around a fixed position, if and only if the initial velocity u̇(t0) ∈ (ker A)^⊥ = corng A is orthogonal to the subspace ker A = ker K of all unstable directions.
As usual, the unstable modes correspond to either translations or rotations, or to
mechanisms of the structure. To prevent a system from exhibiting one of these unstable
motions, one has to ensure that the initial velocity is orthogonal to all of them. The result
is in direct analogy with Theorem 6.8 that requires a force to be orthogonal to all such
unstable modes in order to maintain equilibrium in the structure.
Example 8.52. For example, consider the triangular space station of Section 6.3.
Systems with Different Masses
When a structure has differing masses at the nodes, the Newtonian equations of motion
take the more general form (8.77), which we can rewrite in matrix notation as

ü = − M^{-1} K u = − P u.
The mass matrix M > 0 is positive definite (and, usually, diagonal, although the general theory does not require this latter restriction), while the stiffness matrix K = A^T C A is either positive definite or, in the unstable situation ker A ≠ {0}, positive semi-definite. The resulting coefficient matrix
P = M^{-1} K = M^{-1} A^T C A   (8.90)
is not in general a symmetric matrix, and so we cannot directly apply the preceding constructions. However, P does have the more general self-adjoint form (7.53) based on the weighted inner products
⟨ u ; ũ ⟩ = u^T M ũ,     ⟨⟨ v ; ṽ ⟩⟩ = v^T C ṽ,   (8.91)

on the domain and target spaces for A respectively.


If ker A = {0}, then P > 0 is positive definite in the generalized sense of Definition 7.50. In this case, substituting our trigonometric solution ansatz u(t) = (cos ω t) v into the system results in a generalized matrix eigenvalue problem
K v = λ M v,   or, equivalently,   P v = λ v,   with   λ = ω².   (8.92)
The matrix M plays the role of the identity matrix I in the standard eigenvalue equation (8.16). The proofs for the standard eigenvalue problem are easily modified to handle this situation, and demonstrate that all the eigenvalues are real and non-negative. Moreover, the eigenvectors are orthogonal, but now with respect to the weighted inner product ⟨ u ; ũ ⟩ governed by the mass matrix M. Details are relegated to the exercises.
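As a sketch of how the generalized eigenvalue problem (8.92) might be solved in practice (this is not taken from the text), one can reduce K v = λ M v to a standard symmetric problem using the Cholesky factor of M; the matrices below are arbitrary examples.

    import numpy as np

    M = np.diag([2.0, 1.0])                      # positive definite (diagonal) mass matrix
    K = np.array([[3.0, -1.0],
                  [-1.0, 2.0]])                  # positive definite stiffness matrix

    L = np.linalg.cholesky(M)                    # M = L L^T
    Linv = np.linalg.inv(L)
    S = Linv @ K @ Linv.T                        # symmetric matrix with the same generalized eigenvalues

    lam, Y = np.linalg.eigh(S)                   # standard symmetric eigenvalue problem
    V = Linv.T @ Y                               # generalized eigenvectors: K v = lam M v

    omega = np.sqrt(lam)                         # vibrational frequencies
    print(lam)
    print(np.allclose(K @ V, M @ V * lam))       # verifies K v_i = lam_i M v_i
    print(np.allclose(V.T @ M @ V, np.eye(2)))   # eigenvectors orthonormal in the M inner product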
Friction and Damping

So far, we have not allowed frictional forces to affect the motion of our dynamical
equations. In many physical systems, friction exerts a force on a mass in motion which is
proportional to its velocity. In the simplest case of a single mass attached to a spring, one
amends the balance of forces in the undamped Newton equation (8.78) to be
m d²u/dt² + β du/dt + k u = 0.   (8.93)
As before, m > 0 is the mass, and k > 0 the spring stiffness, while β > 0 measures the effect of a velocity-dependent frictional force: the larger β, the greater the frictional damping of the motion.
The solution of this more general second order homogeneous linear ordinary differential equation is found by substituting an exponential ansatz u(t) = e^{λ t} into the equation, leading to the quadratic characteristic equation
m λ² + β λ + k = 0.   (8.94)
There are three possible cases, illustrated in Figure 8.10:
Underdamped : If β² < 4 m k, then (8.94) has two complex-conjugate roots
λ = − β/(2 m) ± i √(4 m k − β²)/(2 m) = − μ ± i ν.   (8.95)

[Figure 8.10. Damped Vibrations: representative plots of underdamped, critically damped, and overdamped solutions.]

The general solution to the differential equation is

u(t) = e^{− μ t} ( c1 cos ν t + c2 sin ν t ) = r e^{− μ t} cos( ν t − δ ).   (8.96)
This solution represents a damped periodic motion. The amplitude of vibration A(t) = r e^{− μ t} depends upon the time, and decays to zero at an exponential rate as t → ∞. The rate of decay, μ = β/(2 m), tells us that more friction or less mass will cause the system to return to equilibrium faster. (Of course, mathematically, it never quite gets there, but in a real physical system the difference is not noticeable.) On the other hand, the frequency of vibration,
ν = √(4 m k − β²) / (2 m) = √( k/m − β²/(4 m²) ),   (8.97)
remains fixed throughout the motion. The frictionally modified vibrational frequency (8.97) is strictly smaller than the undamped frequency ω = √(k/m), and hence friction has the effect of slowing down vibrations by a fixed amount while simultaneously diminishing their amplitudes in time. As the friction approaches a critical threshold, β → 2 √(m k), the vibrational frequency goes to zero, ν → 0, and so the period of vibration P = 2π/ν goes to ∞.

Overdamped : If β² > 4 m k, then (8.94) has two negative real roots
λ1 = ( − β − √(β² − 4 m k) ) / (2 m),     λ2 = ( − β + √(β² − 4 m k) ) / (2 m),
with λ1 < λ2 < 0. The solution is a linear combination of two decaying exponentials
u(t) = c1 e^{λ1 t} + c2 e^{λ2 t}.
The system describes the motion of a mass in a vat of molasses. Its vibration is so slow that it can pass at most once through its equilibrium position u = 0, and then only when its initial velocity is quite large. In the long term, since λ1 < λ2, the first exponential e^{λ1 t} will decay to zero faster, and hence the overall decay rate of the solution is (unless c2 = 0) governed by the less negative eigenvalue λ2.

Critically Damped : The borderline case occurs when β² = 4 m k, which means (8.94) has only a single negative real root
λ1 = − β / (2 m).
In this case, our ansatz only supplies one exponential solution e^{λ1 t} = e^{− β t/(2 m)}; the second linearly independent solution is obtained by multiplication by t, leading to the general solution
u(t) = ( c1 t + c2 ) e^{− β t/(2 m)}.
Even though the formula looks quite different from the overdamped case, the solution acts in a very similar fashion. The factor of t plays an unimportant role, since the asymptotics of this solution are entirely governed by the decaying exponential function. This is the nonvibrating solution that has the slowest possible decay rate: relaxing the frictional coefficient any further will permit a damped periodic vibration to appear.
In all three cases, provided the frictional coefficient is positive, β > 0, the zero solution
is globally asymptotically stable. Physically, since there is no external forcing, all solutions
eventually return to zero as the friction gradually overwhelms any initial motion.
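A small sketch (assumed parameter values, not from the text) that distinguishes the three regimes and reports the decay rate and, in the underdamped case, the modified frequency (8.97):

    import numpy as np

    def damping_regime(m, beta, k):
        """Classify m u'' + beta u' + k u = 0; return (regime, decay rate, frequency)."""
        disc = beta**2 - 4 * m * k
        mu = beta / (2 * m)                               # exponential decay rate
        if disc < 0:
            nu = np.sqrt(4 * m * k - beta**2) / (2 * m)   # damped frequency (8.97)
            return "underdamped", mu, nu
        if disc == 0:
            return "critically damped", mu, 0.0
        return "overdamped", mu, 0.0

    print(damping_regime(1.0, 0.4, 4.0))   # underdamped: slow decay, frequency just below 2
    print(damping_regime(1.0, 4.0, 4.0))   # critically damped
    print(damping_regime(1.0, 6.0, 4.0))   # overdamped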
Remark : We may convert the second order equation (8.93) into a first order system by adopting the phase plane variables u and v = u̇. The coefficient matrix of the equivalent phase plane system u̇ = A u, with u = ( u, v )^T, is A = [ 0  1 ; − k/m  − β/m ]. In terms of our classification of two-dimensional systems, the undamped case corresponds to a center, the underdamped case to a stable focus, the critically damped case to a stable improper node, and the overdamped case to a stable node. The reader should verify that the relevant conditions are met in each case and correlate the phase portraits with the time plots in Figure 8.10.
This concludes our discussion of the scalar case. Similar considerations apply to mass/spring chains, and two and three-dimensional structures. The frictionally damped system is
M d²u/dt² + B du/dt + K u = 0,   (8.98)
where the mass matrix M > 0 and the matrix of frictional coefficients B > 0 are both diagonal, positive definite, while the stiffness matrix K = A^T C A ≥ 0 is a positive semi-definite Gram matrix constructed from the reduced incidence matrix A, which is positive definite if and only if the structure is stable or the circuit is grounded. The mathematical details in this case are sufficiently complicated that we shall leave their analysis as an advanced project for the motivated student.

8.9. Forcing and Resonance.


So far, we have allowed our system to vibrate on its own. It is now time to start
applying external forces, seeing what happens if we shake it. In this section, we will investigate the effects of periodic forcing on both undamped and damped systems. More general types of forcing can be handled by a general variation of parameters method, cf. [20].
cf. [20].
The simplest case is that of a single mass connected to a spring without any frictional
damping. We append an external forcing function f (t) to the homogeneous (unforced)
equation (8.78), leading to the inhomogeneous ordinary differential equation
m d²u/dt² + k u = f(t),   (8.99)
where m > 0 is the mass and k > 0 the spring stiffness. We are particularly interested in the case of periodic forcing
f(t) = α cos η t   (8.100)
of frequency η > 0 and amplitude α. To find a particular solution to (8.99), (8.100), we


use the method of undetermined coefficients, which tells us to guess a solution ansatz of the form
u*(t) = a cos η t + b sin η t,   (8.101)
where a, b are constants. (One can also use variation of parameters, although the intervening calculations are slightly more complicated.) Substituting into the differential equation, we find
m d²u*/dt² + k u* = a (k − m η²) cos η t + b (k − m η²) sin η t = α cos η t.
We can solve for
a = α / (k − m η²) = α / ( m (ω² − η²) ),   b = 0,
provided the denominator is nonzero:
k − m η² = m (ω² − η²) ≠ 0.   (8.102)
Here ω = √(k/m) refers to the natural, unforced vibrational frequency of the system, while η is the forcing frequency. Therefore, provided the forcing frequency is not equal to the system's natural frequency, η ≠ ω, there exists a particular solution
u*(t) = a cos η t = ( α / ( m (ω² − η²) ) ) cos η t   (8.103)

that vibrates at the same frequency as the forcing function.


The general solution to the inhomogeneous system (8.99) is found, as usual, by adding
in an arbitrary solution (8.79) to the homogeneous equation, yielding
u(t) = r cos( ω t − δ ) + a cos η t,   where   a = α / ( m (ω² − η²) ),   (8.104)
and where r and δ are determined by the initial conditions. The solution is therefore a quasiperiodic combination of two periodic motions: the first, vibrating with frequency ω, represents the internal or natural vibrations of the system, while the second, with frequency η, represents the response of the system to the periodic forcing. Due to the factor ω² − η² in the denominator of (8.104), the closer the forcing frequency is to the natural frequency, the larger the overall amplitude of the response.

[Figure 8.11. Beats in a Periodically Forced Vibration.]
Suppose we start the mass initially at equilibrium, so the initial conditions are
u(0) = 0,   u̇(0) = 0.   (8.105)
Substituting the solution formula (8.104) into (8.105), we find that
r = − a,   δ = 0.
Thus, the solution to the initial value problem can be written in the form
u(t) = a ( cos η t − cos ω t ) = ( 2 α / ( m (ω² − η²) ) ) sin( ½ (ω + η) t ) sin( ½ (ω − η) t ),   (8.106)
using a standard trigonometric identity, cf. Exercise . The factor sin ½ (ω + η) t represents a periodic motion whose frequency is the average of the natural and forcing frequencies. If the forcing frequency η is close to the natural frequency ω, then the initial factor A(t) = 2 a sin ½ (ω − η) t can be viewed as a periodically varying amplitude, whose vibrational frequency ½ (ω − η) is much slower. This factor is responsible for the phenomenon of beats, heard, for example, when two tuning forks of close but unequal pitch vibrate near each other. When the amplitude is small, the sound disappears, periodically reappearing with considerable strength. In the graph of the function
cos 14 t − cos 15.6 t = 2 sin .8 t sin 14.8 t
in Figure 8.11, one sees the slowly varying amplitude A(t) = 2 sin .8 t, with frequency .8,
as an envelope of the relatively rapid vibrations with frequency 14.8.
If we force at exactly the natural frequency η = ω, then the trigonometric ansatz
(8.101) does not work. This is because both terms are now solutions to the homogeneous
equation, and so cannot be combined to form a solution to the inhomogeneous version.
In this situation, there is a simple modification to the ansatz, namely multiplication by t,
that does the trick. Substituting
u*(t) = a t cos ω t + b t sin ω t   (8.107)
into the differential equation (8.99), we find
m d²u*/dt² + k u* = − 2 a m ω sin ω t + 2 b m ω cos ω t = α cos ω t,

[Figure 8.12. Resonance.]

and so
a = 0,   b = α / (2 m ω).
Combining the resulting particular solution with the solution to the homogeneous equation leads to the general solution
u(t) = r cos( ω t − δ ) + ( α / (2 m ω) ) t sin ω t.   (8.108)

Both terms vibrate with frequency ω, but the second term has amplitude going to ∞ as t → ∞. The system will oscillate more and more wildly until the spring eventually breaks; see Figure 8.12. In this case, the system is said to be in resonance, and the wild oscillations are provoked by forcing it at the resonant frequency ω.
If we are very close to resonance, the oscillations induced by the particular solution (8.106) will have extremely large, although not unbounded, amplitude a. The lesson is: never force a system at or close to its natural frequency (or frequencies) of vibration. The classic example is the Tacoma Narrows Bridge disaster, when the vibration in the bridge caused by a strong wind was close enough to the bridge's natural frequency to cause it
to oscillate wildly and collapse! A second example is the practice of the British (and
subsequently, U.S.) infantry who, learning from experience, do not march in unison across
a bridge so as not to set off a resonant frequency and cause the bridge to collapse.
If we include frictional effects, then we can partially mollify the wild behavior near
the resonant frequency. The frictionally damped vibrations of a mass on a spring, when
subject to periodic forcing, can be described by the inhomogeneous version
m d²u/dt² + β du/dt + k u = α cos η t   (8.109)
of equation (8.93). Let us assume that the friction is not too large, so we are in the underdamped regime β² < 4 m k. Since neither summand solves the homogeneous system, we can use the trigonometric solution ansatz (8.101) to construct the particular solution
u*(t) = A cos( η t − ε ),   where   A = α / √( m² (ω² − η²)² + β² η² )   (8.110)
represents the amplitude of the response to the periodic forcing, while
ε = tan⁻¹( β η / ( m (ω² − η²) ) )   (8.111)

represents a phase lag in the response of the system due to friction.


The general solution is
u(t) = r e^{− μ t} cos( ν t − δ ) + A cos( η t − ε ),   (8.112)
where r, δ are determined by the initial conditions, while λ = − μ ± i ν are the roots of the characteristic equation, cf. (8.95). The first term, the solution to the homogeneous equation, is called the transient, since it decays exponentially fast to zero as a result of the friction in the system. Thus, at large times, the internal motion of the system that might have been excited by the initial conditions dies out, and only the particular solution (8.110) incited by the forcing persists. The amplitude of the persistent response (8.110) is at a maximum at the resonant frequency η = ω, where it takes the value Amax = α/(β ω). Thus, the smaller the frictional coefficient β (or the slower the resonant frequency ω), the more likely the breakdown of the system due to an overly large response.
The friction also induces a phase shift ε in the response of the system to the external forcing. Speeding up the forcing frequency η increases the overall phase shift, which has the value ε = ½ π at the resonant frequency η = ω, so the system lags a quarter period behind the forcing, and reaches a maximum ε = π as η → ∞. Thus, the response of the system to a high frequency forcing is almost exactly out of phase: the mass is moving downwards when the force is pulling it upwards, and vice versa!
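The frequency response (8.110)-(8.111) can be tabulated directly; the following sketch (sample parameters only) shows the amplitude peaking near the resonant frequency and the phase sweeping from 0 toward π.

    import numpy as np

    m, beta, k, alpha = 1.0, 0.2, 4.0, 1.0     # sample underdamped system, beta^2 < 4 m k
    omega = np.sqrt(k / m)                     # natural frequency

    for eta in (0.5, 1.5, 2.0, 2.5, 4.0):      # forcing frequencies, including eta = omega
        A = alpha / np.sqrt(m**2 * (omega**2 - eta**2)**2 + beta**2 * eta**2)   # (8.110)
        eps = np.arctan2(beta * eta, m * (omega**2 - eta**2))                   # (8.111)
        print(f"eta = {eta:4.2f}   amplitude = {A:7.4f}   phase lag = {eps:6.4f}")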
Electrical Circuits
We already saw how the equations governing the equilibria of simple electrical circuits have the same basic mathematical structure as those of mechanical systems such as
mass/spring chains and structures. In a similar manner, circuits with time-varying currents can also be modeled by linear dynamical systems of ordinary differential equations.
In this section, we analyze the simplest situation of a circuit consisting of a resistor , an
inductor and a capacitor connected together in a loop as illustrated in Figure RLC . This
configuration is known as an R L C circuit after the standard electrical symbols for its
three constituents.
Let u(t) denote the current in the circuit at time t. As the current passes through
each circuit element, it induces a corresponding voltage, which we denote by v R , vL and
vC . These are prescribed by the basic laws of electrical circuit design.
(a) First, as we know from Section 6.2, the resistance R ≥ 0 in the circuit is the proportionality factor between voltage and current, so vR = R u.
(b) The voltage passing through an inductor is proportional to the rate of change in the current. Thus, vL = L u̇, where L > 0 is the inductance, and the dot indicates time derivative.
(c) On the other hand, the current passing through a capacitor is proportional to the rate of change in the voltage, and so u = C v̇C, where C > 0 denotes the capacitance. We integrate this relation to produce the capacitor voltage vC = ∫ (u(t)/C) dt.
Wwe integrate this relation to produce the capacitor voltage by vC = (u(t)/C) dt.

The voltage balance law tells us that the total of these individual induced voltages must equal any externally applied voltage coming from, say, a battery. Therefore,
vR + vL + vC = vE,
where vE = f(t) denotes the applied voltage due to a time-varying external current source. Substituting the preceding formulae for the voltages, we deduce that the current u(t) in our circuit satisfies the following linear integro-differential equation
L du/dt + R u + ∫ (u/C) dt = f(t).   (8.113)
We can convert this into a differential equation by differentiating both sides with respect to
t. Assuming L, R, C are constant, the result is the linear second order ordinary differential
equation
L d²u/dt² + R du/dt + (1/C) u = f′(t).   (8.114)
In particular, the homogeneous version governs the current in an R L C circuit with a constant applied voltage source, f′(t) ≡ 0. On the other hand, an alternating current source will produce a time-varying voltage
f(t) = f0 + a sin η t.
Comparing (8.114) with the equation (8.93) for a mechanically vibrating mass, we
see that the electrical-mechanical analogy developed in Chapter 6 continues to hold in the
dynamical regime. The current corresponds to the displacement. The inductance plays the
role of mass, the resistance the role of friction, and the reciprocal 1/C of capacitance the
role of the spring stiffness. Thus, all the conclusions on stability, behavior and formulae
for solutions, etc., that we already established in the mechanical context can, suitably
re-interpreted, be immediately applied to electrical circuit theory.
In particular, the R L C circuit is underdamped if R² < 4 L/C, and the current u(t) oscillates with frequency
ν = √( 1/(L C) − R²/(4 L²) ),   (8.115)
while slowly dying off to zero. In the overdamped and critically damped cases R² ≥ 4 L/C, where the resistance of the circuit is large, the current merely decays to zero exponentially fast and there is no longer any oscillatory behavior in the circuit.
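Using the mechanical-electrical dictionary (inductance for mass, resistance for friction, 1/capacitance for stiffness), the same damping classification can be reused for an R L C circuit; here is a brief sketch with made-up component values.

    import numpy as np

    R, L, C = 10.0, 0.5, 1.0e-3      # sample resistance, inductance, capacitance

    if R**2 < 4 * L / C:             # underdamped circuit
        nu = np.sqrt(1.0 / (L * C) - R**2 / (4 * L**2))   # oscillation frequency (8.115)
        print("underdamped: current oscillates with frequency", nu, "rad/s")
    else:
        print("overdamped or critically damped: current decays without oscillating")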
Forcing and Resonance in Systems
Let us very briefly discuss the effect of periodic forcing on a more complicated system.
For undamped mass/spring chains, structures and more complicated L C circuits, we are
led to consider a periodically forced second order system

M ü + K u = (cos η t) a,   (8.116)

where a is a constant vector representing both a magnitude and a direction of the forcing. Here M > 0 is the diagonal mass matrix (or inductance matrix in the circuit case), while K = A^T C A is the (semi-)definite stiffness (or conductance) matrix for the system. We are ignoring friction (resistance) in this case. We only need consider the effect of one such forcing, since we can employ the general inhomogeneous superposition principle of Theorem 7.33 to determine the effect of linear combinations of forcings at different frequencies and in different directions.
To find a particular solution to the system, let us try a solution of the form
u*(t) = (cos η t) w   (8.117)

for some constant vector w. Substituting into (8.116), we are led to a linear system
(K − λ M) w = a,   where   λ = η².   (8.118)
If equation (8.118) has a solution, then our ansatz (8.117) is valid, and we have produced a particular vibration of the system with the same frequency as the forcing vibration. The general solution, then, will be a quasi-periodic combination of this particular vibration at the forcing frequency combined with the vibrations at the system's natural, unforced frequencies. In particular, if λ = η² is not a generalized eigenvalue of the matrix pair K, M, as in (8.92), then the coefficient matrix K − λ M is nonsingular, and so (8.118) can
be solved for any right hand side a.
The more interesting case is when K − λ M is singular, its kernel being equal to the generalized eigenspace V_λ. In this case, (8.118) will have a solution w if and only if a lies in the range of K − λ M. According to the Fredholm Alternative Theorem 5.45, the range is the orthogonal complement of the cokernel, which, since the coefficient matrix is symmetric, is the same as the kernel; see Exercise . Therefore, (8.118) will have a solution if and only if a is orthogonal to V_λ, i.e., a · v = 0 for every eigenvector v for the eigenvalue λ. Thus, one can force a system at a natural frequency without inciting
resonance provided the direction of forcing, as governed by the vector a, is orthogonal
(with respect to the inner product induced by the mass matrix M ) to the natural directions
of motion of the system, as governed by the eigenvectors for that particular frequency.
If this orthogonality constraint is not satisfied, then the periodic solution ansatz does
not apply, and we are in a truly resonant situation. Inspired by the scalar solution, let us
try the resonant ansatz
u*(t) = (t sin η t) y + (cos η t) w.   (8.119)
We compute
d²u*/dt² = (− η² t sin η t) y + (cos η t)(2 η y − η² w).
Therefore (8.119) will solve (8.116) provided
(K − η² M) y = 0,     (K − η² M) w = a − 2 η y.

The first equation requires that y be a generalized eigenvector of the matrix pair K, M .
The second will be solvable for w if and only if a − 2 η y is orthogonal to the eigenspace V_λ. This will be true if and only if 2 η y is the orthogonal projection of a onto V_λ. With

this choice of y and w, the basic ansatz works and produces a resonant solution to the
system.
Summarizing:
Theorem 8.53. An undamped vibrational system will be periodically forced into
resonance if and only if the forcing f = (cos η t) a is at a natural frequency of the system
and the direction of forcing a is not orthogonal to the natural direction of motion of the
system for that frequency.


Chapter 9
Iteration of Linear Systems
Iteration, or repeated application of a function, appears in a surprisingly wide range
of applications. Discrete dynamical systems, in which the continuous time variable has
been quantized in discrete units (seconds, days, years, etc.) are represented by iterative
systems. Most numerical solution methods, for both linear and nonlinear equations, are
based on an iterative procedure. Starting with an initial guess, the successive iterates lead
to closer and closer approximations to the true solution. For linear systems of equations,
iterative solution methods can be used as an attractive alternative to Gaussian elimination,
and one that works particularly well for the very large, sparse systems arising in the numerical solution to both ordinary and partial differential equations. In probability theory,
population dynamics and other applications, iterative models known as Markov processes
govern basic probabilistic processes. All practical methods for computing eigenvalues and
eigenvectors are based on an iterative characterization, and completely avoid solving the
characteristic equation.
In this chapter, we concentrate on the iteration of linear systems. As always, proper
understanding of the linear situation is an essential prerequisite for tackling the far more
complicated nonlinear realm, which will be deferred until Chapter 18. Linear iteration
coincides with multiplication by successive powers of a matrix. The convergence of the
iterates depends on the eigenvalues of the coefficient matrix. The largest eigenvalue (in
modulus) is known as the spectral radius of the matrix, and convergence requires a
spectral radius smaller than 1. While exact computation of the eigenvalues is typically
a difficult issue, the simple but effective Gerschgorin Circle Theorem can be used to give
useful estimates, and thereby ensure convergence. Several applications, including Markov
processes and Fibonacci sequences, will be presented. As a practical alternative to convergence criteria based on the spectral radius, we introduce the notion of the norm of a
matrix, which can be more easily computed from the matrix entries. Iterative methods
with coefficient matrices of norm less than one are guaranteed to converge.
In the following section, we present the three most important iterative schemes for
solving linear systems of algebraic equations. The classical Jacobi method is the simplest,
while an evident modification leads to the popular Gauss-Seidel method. Completely general conditions ensuring convergence of these schemes to the solution of the original system are hard to formulate, although convergence is assured for the important class of diagonally dominant matrices that arise in many applications. A simple modification of the Gauss-Seidel scheme known as Successive Over-Relaxation (SOR) has dramatic effects
on the speed of convergence, and is the method of choice in many modern applications.
In the final section we discuss the computation of eigenvalues and eigenvectors of

matrices. Needless to say, we completely avoid trying to solve (or even compute) the
characteristic polynomial equation. A simple power method based on linear iteration is
an effective means of finding the largest eigenvalue. Deflation techniques can then be
employed to compute the next few eigenvalues. In the case of positive definite matrices,
the remarkable Q R algorithm and its variants are the methods of choice. The algorithm
is based on the Gram-Schmidt orthogonalization procedure, inspired by the orthogonality
of the eigenvectors.

9.1. Linear Iterative Systems.


We begin with a basic study of iterative systems of linear equations.
Definition 9.1. A linear iterative system takes the form
u(k+1) = T u(k),   u(0) = a.   (9.1)

The coefficient matrix T has size n × n. We will consider both real and complex
systems, and so the iterates u(k) are vectors either in R n (which assumes that the coefficient
matrix T is also real) or in C n . A linear iterative system can be viewed as a discretized
version of a first order system of linear ordinary differential equations, as in (8.2), in which
the state of the system, as represented by the vector u(k), changes at discrete time intervals,
labelled by the index k, which takes non-negative integer values. The solution is uniquely
determined by the initial conditions u(0) = a.
Scalar Systems
As usual, one begins with an analysis of the scalar version. Consider the iterative
equation
u(k+1) = λ u(k),   u(0) = a.   (9.2)
The general solution to (9.2) is easily found:
u(1) = λ u(0) = λ a,   u(2) = λ u(1) = λ² a,   u(3) = λ u(2) = λ³ a,
and, in general,
u(k) = λ^k a.   (9.3)
If the initial condition is a = 0, then u(k) ≡ 0. Therefore, 0 is a fixed point or equilibrium solution for the iterative system.
Let us first analyze the case when λ ∈ R is a real constant. Apart from the equilibrium solution, the iterates experience several qualitatively different behaviors, depending on the size of the coefficient λ.
(a) If λ = 0, the solution immediately becomes zero, and stays there, so u(k) = 0 for all k ≥ 1.
(b) If 0 < λ < 1, then the solution is of one sign, and tends monotonically to zero, so u(k) → 0 as k → ∞.
(c) If − 1 < λ < 0, then the solution tends to zero, u(k) → 0 as k → ∞. Successive iterates have alternating signs.
[Figure 9.1. One-Dimensional Real Linear Iterative Systems: representative scatter plots of the iterates for 0 < λ < 1, − 1 < λ < 0, λ = − 1, λ = 1, λ > 1, and λ < − 1.]

(d) If λ = 1, the solution is constant, u(k) = a, for all k ≥ 0.
(e) If λ = − 1, the solution switches back and forth between two values; u(k) = (− 1)^k a.
(f) If 1 < λ < ∞, then the iterates u(k) become unbounded. If a > 0, they go monotonically to + ∞; if a < 0, to − ∞.
(g) If − ∞ < λ < − 1, then the iterates u(k) also become unbounded. In this case, successive iterates have alternating signs.
In Figure 9.1 we exhibit representative scatter plots for the nontrivial cases (b)-(g).
To describe the different scenarios, we adopt a terminology that already appeared in the continuous realm. In the first three cases, the fixed point u = 0 is said to be globally asymptotically stable since all solutions tend to 0 as k → ∞. In cases (d) and (e), the zero solution is stable, since solutions with nearby initial data, | a | ≪ 1, remain nearby. In the final two cases, the zero solution is unstable; any nonzero initial data a ≠ 0, no matter how small, will give rise to a solution that eventually goes arbitrarily far away from equilibrium.
Let us next consider the case of a complex scalar iterative system. The coefficient λ and the initial data a in (9.2) are allowed to be complex numbers. The solution is the same, (9.3), but now we need to know what happens when we raise a complex number to a high power. The secret is to write λ = r e^{iθ} in polar form, where r = | λ | is its modulus and θ = ph λ its angle or phase. Then λ^k = r^k e^{i k θ}. Since | e^{i k θ} | = 1, we have | λ^k | = | λ |^k, and so the solutions (9.3) have modulus | u(k) | = | λ^k a | = | λ |^k | a |. As a result, u(k) will remain bounded if and only if | λ | ≤ 1, and will tend to zero as k → ∞ if and only if | λ | < 1.
u

Theorem 9.2. The zero solution to a (real or complex) scalar iterative system
= u(k) is
(a) asymptotically stable if and only if | | < 1,

(k+1)

3/7/03

326

c 2003

Peter J. Olver

(b) stable if and only if | | 1,


(c) unstable if and only if | | > 1.
Powers of Matrices
The solution to the general linear matrix iterative system
u(k+1) = T u(k) ,

u(0) = a,

(9.4)

is also, at least at first glance, immediate. Clearly,


u(1) = T u(0) = T a,

u(2) = T u(1) = T 2 a,

u(3) = T u(2) = T 3 a,

and, in general,
u(k) = T k c.

(9.5)

Thus, the iterates are simply determined by multiplying the initial vector a by the successive powers of the coefficient matrix T .
However, unlike real or complex scalars, the general formulae and qualitative behavior
of the powers of a square matrix are not nearly so immediately apparent. (Before continuing, the reader is urged to experiment with simple 2 2 matrices, and try to detect
patterns.) To resolve this dilemma, recall that we managed to solve linear systems of
differential equations by suitably adapting the known exponential solution from the scalar
version. In the discrete case, we no longer have exponentials, but rather powers, in our
scalar solution formula (9.3). This motivates us to try the power ansatz
u(k) = k v,

(9.6)

where is a scalar and v is a fixed vector, as a possible solution. We find


u(k+1) = k+1 v,

while

T u(k) = k T v.

These two expressions will be equal if and only if


T v = v.
Therefore, (9.6) is a nontrivial solution to (9.4) if and only if is an eigenvalue of T and v an
associated eigenvector . Thus, to each eigenvector and eigenvalue of the coefficient matrix,
we can construct a solution to the iterative system. We can then use linear superposition,
as in Theorem 7.21, to combine the basic power solutions to form more general solutions.
In particular, if the coefficient matrix is complete, then this method will, as in the case of
linear ordinary differential equations, produce the general solution.
Theorem 9.3. If the coefficient matrix T is complete, then the general solution to
the linear iterative system u(k+1) = T u(k) is given by namely
u(k) = c1 k1 v1 + c2 k2 v2 + + cn k1 vn ,

(9.7)

where v1 , . . . , vn are the linearly independent eigenvectors and 1 , . . . , n the corresponding eigenvalues of T . The coefficients c1 , . . . , cn are arbitrary scalars, and are uniquely
prescribed by the initial conditions u(0) = a.
3/7/03

327

c 2003

Peter J. Olver

Proof : Since we already know that (9.7) is a solution to the system for arbitrary
c1 , . . . , cn , it suffices to prove that we can match any prescribed initial conditions. We
need to solve the linear system
u(0) = c1 v1 + + cn vn = a.

(9.8)

Completeness of T implies that its eigenvectors form a basis of C n , and hence (9.8) always
admits a solution. In matrix form, we can rewrite (9.8) as
S c = a,

c = S 1 a,

so that

where S = ( v1 v2 . . . vn ) is the (nonsingular) matrix whose columns are the eigenvectors.


Q.E.D.
Remark : Incomplete cases require more work, and rely on the Jordan canonical form
of Appendix D.
Example 9.4. Consider the iterative system
x(k+1) =

3
10

x(k) +

y (k) ,

y (k+1) =

x(0) = a,

y (0) = b.

1
10

1
10

x(k) +

3
10

y (k) ,

(9.9)

with initial conditions


(9.10)

The system can be rewritten in our matrix form (9.4) with

(k)

x
.3 .1
(k)
,
,
u =
T =
y (k)
.1 .3


a
.
a=
b

Solving the characteristic equation


det(T I ) = 2 .6 .08 = 0
produces the eigenvalues 1 = .4, 2 = .2. We then solve the associated linear systems
(T j I )vj = 0 for the corresponding eigenvectors:


1
1
.
,
2 = .2 ,
v2 =
1 = .4 ,
v1 =
1
1
Therefore, the basic power solutions are

1
(k)
k
,
u1 = ( .4)
1

(k)
u2

= ( .2)

1
.
1

Theorem 9.3 tells us that the general solution is given as a linear combination,


1
1
c1 ( .4)k + c2 ( .2)k
(k)
(k)
(k)
k
k
,
u = c1 u1 + c2 u2 = c1 ( .4)
+ c2 ( .2)
=
1
1
c1 ( .4)k c2 ( .2)k
where c1 , c2 are arbitrary scalars, whose values are determined by the initial conditions:

a+b
ab
a
c1 + c 2
(0)
,
and hence
c1 =
=
u =
,
c2 =
.
b
c1 c 2
2
2
3/7/03

328

c 2003

Peter J. Olver

Therefore, the explicit formula for the solution to (9.9), (9.10) is


x(k) = (.4)k

ab
a+b
+ (.2)k
,
2
2

y (k) = (.4)k

a+b
ab
(.2)k
.
2
2

In particular, as k , the iterates u(k) 0 converge to zero at a rate governed by the


larger eigenvalue 1 = .4. Thus, (9.9) defines a stable iterative system.
Example 9.5.
scheme

The Fibonacci numbers are defined by the second order iterative


u(k+2) = u(k+1) + u(k) ,

(9.11)

with initial conditions


u(0) = a = 1,

u(1) = b = 1.

(9.12)

Thus, to obtain the next Fibonacci number, we add the previous two; the first few are
u(2) = 2,

u(3) = 3,

u(4) = 5,

u(5) = 8,

u(6) = 13,

....

The Fibonacci numbers occur in a surprising range of natural objects, including leaves,
flowers, and fruit, [F]. They were originally introduced by the Renaissance mathematician
Fibonacci (Leonardo of Pisa) as a crude model of the growth of a population of rabbits.
In Fibonaccis model, the k th Fibonacci number u(k) measures the total number of pairs
of rabbits at year k. We start the process with a single juvenile pair at year 0. Once a
year, each pair of rabbits produces a new pair of offspring, but it takes a year for a rabbit
pair to mature enough to produce offspring of their own.
Just as every higher order ordinary differential equation can be replaced by an equivalent first order system, so every higher order iterative system can be replaced by a first
order iterative system. In this particular case, we define the vector
(k+1)
u
(k)
u =
R2,
u(k)
and note that (9.11) is equivalent to the matrix system
(k+1)
(k+2)
u
u
1 1
, or u(k+1) = T u(k) ,
=
(k+1)
u(k)
1 0
u

where

T =

1 1
1 0

To find an explicit formula for the Fibonacci numbers, we need to determine the eigenvalues
and eigenvectors of the coefficient matrix T . A straightforward computation produces

1+ 5
1 5
1 =
= 1.618034 . . . ,
2 =
= .618034 . . . ,
2
2
!
!
v1 =

1+ 5
2

v2 =

1 5
2

We ignore important details like the sex of the offspring.

3/7/03

329

c 2003

Peter J. Olver

Therefore, according to (9.7), the general solution to the Fibonacci system is


u(k) =

(k+1)

u
u(k)

= c1

!k 15 !
!k 1+5 !
1

5
1+ 5
2
2
+ c2
.
2
2
1
1

The initial data


u(0) = c1

1+ 5
2

+ c2

uniquely specifies the coefficients

2 b (1 5) a

c1 =
,
2 5

1 5
2

(9.13)

!
a
=
b

2 b + (1 +

c2 =
2 5

5) a

The second entry of the solution vector (9.13) produces the formula
u(k)

!k
!k

2 b + (1 + 5) a 1 5
2 b (1 5) a 1 + 5

=
2
2
2 5
2 5

(9.14)

for the k th generalized Fibonacci number. For the particular initial conditions a = b = 1,
formula (9.14) reduces to the classical Binet formula

!k+1
!k+1
1 5
1+ 5
1

(9.15)
u(k) =
2
2
5

for the Fibonacci integers. It is a remarkable fact that, for every value of k, all the 5s
cancel out, and the Binet formula does indeed produce the Fibonacci integers tabulated
above. Another observation is that since

51
1+ 5
0 < | 2 | =
< 1 < 1 =
,
2
2

the terms involving k1 go to (and so the zero solution to this iterative system is unstable)
while the terms involving k2 go to zero. Therefore, even for k moderately large, the first
term in (9.14) is a very good approximation (and one that gets more and more accurate
with

1
th
increasing k) to the k Fibonacci number. The dominant eigenvalue 1 = 2 1 + 5 =
1.618034 . . . is known as the golden ratio and plays an important role in spiral growth in
nature as well as in art, architecture and design, [F]. It describes the overall growth rate
of the Fibonacci integers, and, in fact, any sequence
of generalized Fibonacci numbers as
long as the initial conditions satisfy b 6= 12 1 5 a.

3 1
6
Example 9.6. Let T = 1 1 2 be the coefficient matrix for a three1 1 0
(k+1)
dimensional iterative system u
= T u(k) . The eigenvalues and corresponding eigen3/7/03

330

c 2003

Peter J. Olver

vectors are
1 = 2,

4
v1 = 2 ,
1

2 = 1 + i ,

2 i
v2 = 1 ,
1

3 = 1 i ,

2+ i
v3 = 1 .
1

Therefore, according to (9.7), the general complex solution to the iterative system is

2+ i
2 i
4
u(k) = b1 ( 2)k 2 + b2 ( 1 + i )k 1 + b3 ( 1 i )k 1 ,
1
1
1

where b1 , b2 , b3 are arbitrary complex scalars.


If we are only interested in real solutions, we can, as in the case of systems of differential equations, break up complex solutions into their real and imaginary parts, each of
which constitutes a real solution. (This is another manifestation of the general Reality
3 Theorem 7.38, but is not hard to prove directly.) We begin by writing 2 = 1+ i = 2e i /4 ,
and hence

( 1 + i )k = 2k/2 e3 k i /4 = 2k/2 cos 43 k + i sin 43 k .


Therefore, the complex solution

3
3
3
3
2
cos
k

+
sin
k

2
sin
k

cos
k

2 i
4
4
4
4

k/2
3
3
( 1 + i )k 1 = 2k/2
+
i
2

cos 4 k
sin 4 k
1
cos 3 k
sin 3 k
4

yields two independent real solutions. (The complex conjugate eigenvalue 3 = 1 i


leads, as usual, to the complex conjugate solution and the same two real solutions.)
The general real solution to the system has the ugly explicit formula
u(k) =

2 cos 43 k + sin 34 k
2 sin 43 k cos 34 k
4

k/2
c1 ( 2)k 2 + c2 2k/2
+ c3 2
,
cos 43 k
sin 43 k
1
cos 3 k
sin 3 k

(9.16)
where c1 , c2 , c3 are arbitrary real scalars, uniquely prescribed by the initial conditions.

9.2. Stability.
With the solution formula (9.7) in hand, we are now in a position to understand
the qualitative behavior of solutions to (complete) linear iterative systems. The most
important case for applications is when all the iterates converge to 0.
Definition 9.7. The equilibrium solution u? = 0 to a linear iterative system (9.1)
is called asymptotically stable if and only if all solutions u (k) 0 as k .
3/7/03

331

c 2003

Peter J. Olver

Stability of the solutions to an iterative system relies on the following property of the
coefficient matrix.
Definition 9.8. A matrix T is called convergent if its powers T k O converge to
the zero matrix as k .
The equivalence of the convergence condition and stability of the iterative system
follows immediately from the solution formula (9.5).
Lemma 9.9. The linear iterative system u(k+1) = T u(k) has asymptotically stable
zero solution if and only if T is a convergent matrix.
For the analysis of convergence, we shall adopt a norm k k on our underlying vector
space, R n or C n . The reader may be inclined to choose the Euclidean (or Hermitian)
norm, but, in practice, the L norm

k u k = max | u1 |, . . . , | un |
(9.17)
prescribed by the vectors maximal entry (in modulus) is usually much easier to work with.
Convergence of the iterates is equivalent to convergence of their norms:
u(k) 0

if and only if

k u(k) k 0

as

k .

(See also Section 11.5 for additional details on convergence.)


The fundamental stability criterion is prescribed by the size of the eigenvalues of the
coefficient matrix.
Theorem 9.10. A linear iterative system (9.1) has asymptotically stable zero solution if and only if all its (complex) eigenvalues have modulus strictly less than one:
| j | < 1.
Proof : Let us prove this result assuming that the coefficient matrix T is complete. If
(k)
j is an eigenvalue such that | j | < 1, then the corresponding basis solution uj = kj vj
tends to zero as k ; indeed,
(k)

k uj k = k kj vj k = | j |k k vj k 0

since

| j | < 1.

Therefore, if all eigenvalues are less than 1 in modulus, all terms in the solution formula
(9.7) tend to zero, which proves asymptotic stability: u(k) 0.
Q.E.D.
Remark : While the proofs in this section rely on completeness of the coefficient matrix,
the definitions and results are all stated to also cover incomplete matrices with less than
a full complement of eigenvectors. The proofs in the incomplete case rely on the Jordan
canonical form; details are provided in Appendix D.
Consequently, the necessary and sufficient condition for asymptotic stability of a linear
iterative system is that all the eigenvalues of the coefficient matrix lie strictly inside the
unit circle in the complex plane: | j | < 1. Let us formalize this basic result.
3/7/03

332

c 2003

Peter J. Olver

Definition 9.11. The spectral radius of a matrix T is defined as the maximal modulus of all of its real and complex eigenvalues
(T ) = max { | 1 |, . . . , | k | }.
We can thus restate the Stability Theorem 9.10 as follows.
Theorem 9.12. The matrix T is convergent if and only if its spectral radius is
strictly less than one: (T ) < 1.
If T is complete, then we can apply the triangle inequality to (9.7) to estimate
k u(k) k = k c1 k1 v1 + + cn k1 vn k

| 1 |k k c 1 v1 k + + | n |k k c n vn k

(T )k | c1 | k v1 k + + | cn | k vn k = C (T )k ,

(9.18)

for some constant C > 0 that depends only upon the initial conditions. In particular, if
(T ) < 1, then
k u(k) k C (T )k 0
as
k ,
(9.19)
in accordance with Theorem 9.12. Thus, the spectral radius prescribes the rate of convergence of the solutions to equilibrium. The smaller the spectral radius, the faster the
solutions converge to 0.
If T has only one largest (simple) eigenvalue, so | 1 | > | j | for all j > 1, then the
first term in the solution formula (9.7) will eventually dominate all the others: k k1 v1 k
k kj vj k for j > 1 and k 0 large. Therefore, if c1 6= 0, the solution (9.7) has the
asymptotic formula
(9.20)
u(k) c1 k1 v1 ,
and so most solutions end up parallel to the dominant eigenvector v 1 . In particular, if
| 1 | = (T ) < 1, such a solution approaches 0 along the direction of the dominant eigenvector v1 at a rate governed by the modulus of the dominant eigenvalue. The exceptional
solutions, with c1 = 0, tend to 0 at a faster rate, along one of the other eigendirections.
Remark : The inequality (9.18) only applies to complete matrices. In the general case,
one can prove that the solution satisfies the slightly weaker inequality
k u(k) k C k ,

for all

k 0,

where

> (T )

(9.21)

is any number larger than the spectral radius, while C > 0 is a positive constant (that
may depend on how close is to ).
Example 9.13. According to Example 9.6, the matrix

T =
1
1
3/7/03

1
6
1 2
1 0

has eigenvalues

1 = 2,
2 = 1 + i ,

3 = 1 i .
333

c 2003

Peter J. Olver


Since | 1 | = 2 > | 2 | = | 3 | = 2 , the spectral radius is (T ) = | 1 | = 2. We conclude
that T is not a convergent matrix. As the reader can check, either directly, or from the
solution formula (9.16), the vectors u(k) = T k u(0) obtained by repeatedly multiplying any
nonzero initial vector u(0) by T rapidly go off to .
On the other hand, the matrix

1 = 32 ,
1
13 2

1
2
with eigenvalues
2 = 13 (1 i ),
Te = 13 T = 31

3
3
1
1
3 = 31 (1 + i ),
0
3

has spectral radius (Te) = 32 , and hence is a convergent matrix.

According to (9.20), if we write the initial data u(0) = c1 v1 + c2 v2 + c3 v3 as a


linear combination of the eigenvectors, and c1 6= 0, then the iterates have the asymptotic

k
T
form u(k) c1 32
v1 , where v1 = ( 4, 2, 1 ) is the eigenvector corresponding
to the dominant eigenvalue 1 = 32 . Thus, for most initial vectors, the iterates end
up decreasing in length by a factor of almost exactly 32 and eventually parallel to the
T
dominant eigenvector ( 4, 2, 1 ) . This is borne out by a sample computation; starting
T
with u(0) = ( 1, 1, 1 ) , the successive iterates are

0.0936
0.0627
0.0416
0.0275
0.0182
0.0462 ,
0.0312 ,
0.0208 ,
0.0138 ,
0.0091 ,
0.0231
0.0158
0.0105
0.0069
0.0046

0.0121
0.0081
0.0054
0.0036
0.0024
0.0061 ,
0.0040 ,
0.0027 ,
0.0018 ,
0.0012 ,
0.0030
0.0020
0.0013
0.0009
0.0006

and so on.

Fixed Points
The zero vector 0 is always a fixed point for a linear iterative system u (k+1) = T u(k) .
Are there any others? The answer is immediate: u? is a fixed point if and only if
u? = T u ? ,
and hence any nonzero u? must be an eigenvector of T with eigenvalue 1. Thus, the system
has a nonzero fixed point if and only if the coefficient matrix T has 1 as an eigenvalue.
Since any scalar multiple of the eigenvector u? is also an eigenvector, in such cases the
system admits infinitely many fixed points.
The stability of such fixed points, at least if the coefficient matrix is complete, is
governed by the same solution formula (9.7). If the eigenvalue 1 = 1 is simple, and all
other eigenvalues are less than one in modulus, | j | < 1, j 2, then the solution takes
the asymptotic form
u(k) = c1 v1 + c2 k2 v2 + + cn k1 vn c1 v1 ,
3/7/03

334

as

k ,
c 2003

(9.22)

Peter J. Olver

converging to one of the fixed points, i.e., a multiple of the eigenvector v 1 . The actual
multiple c1 is determined by the initial conditions, as in (9.8). The rate of convergence is
governed by the next largest eigenvalue modulus | 2 |.

The general convergence result governing the stability of fixed points for general coefficient matrices follows. Again, the proof in the incomplete case can be found in Appendix D.

Theorem 9.14. Suppose that T has a simple (or, more generally, complete) eigenvalue 1 = 1, and, moreover, all other eigenvalues satisfy | j | < 1, for j 2. Then
all solutions to the linear iterative system u(k+1) = T u(k) converge to an eigenvector
associated to the eigenvalue 1 = 1.

3
2

Example 9.15. For the matrix T = 21


1
2

sponding eigenvectors are


1 = 1,

4
v1 = 2 ,
1

12
1
2
1
2

1+ i
,
2

2 i
v2 = 1 ,
1

2 =

1 , the eigenvalues and corre0

1 i
,
2

2+ i
v3 = 1 .
1
3 =

Since 1 = 1, any multiple of the eigenvector v1 is a fixed point. The


fixed points are
stable since the remaining eigenvalues have modulus | 2 | = | 3 | = 12 2 0.7071 < 1.
Thus, the iterates u(k) = T k a c1 v1 will eventually converge, at a rate of about .7, to a
T
multiple of the first eigenvector. For example, starting with u(0) = ( 1, 1, 1 ) , leads to the
iterates (we only display every fifth one)

9.5
u(5) = 4.75 ,
2.75

8.0088
u(20) = 4.0029 ,
2.0029

u(10)

u(25)

7.9062
= 3.9062 ,
1.9062

7.9985
= 3.9993 ,
1.9993

u(15)

u(30)

7.9766
= 4.0 ,
2.0

8.0001
= 4.0001 ,
2.0001
T

which are slowly converging to the particular eigenvector ( 8, 4, 2 ) = 2 v1 . This can


be predicted in advance by decomposing the initial condition into a linear combination of
eigenvectors:

u(0)

3/7/03


2 i
2+ i
4
1
3 3i
3 + 3i
1 +
1 ,
= 1 = 2 2 +
2
2
1
1
1
1
335

c 2003

Peter J. Olver

whence
u(k)

k 2 i

k 2 + i
4
3 + 3i 1 + i
1 + 3 3 i 1 i
1
= 2 2 +
2
2
2
2
1
1
1

4
2 2
as
k .
1

9.3. Matrix Norms.


The convergence of a linear iterative system is governed by the spectral radius of
the coefficient matrix, and hence knowledge of its eigenvalues is essential. Unfortunately,
a priori information on the eigenvalues is not so easy to come by. Indeed, computing
accurate approximations to the eigenvalues of a general matrix is a difficult computational
problem, and completely satisfactory general numerical algorithms are not known. We
will discuss numerical methods for computing eigenvalues in Section 9.6. The best way to
compute the spectral radius is, in fact, to explicitly iterate the matrix and observe how
fast the resulting vectors grow or decay. But this defeats its purpose!
An alternative, more practical approach to convergence is based on the concept of a
matrix norm. Matrix norms are a natural class of norms on the vector space of n n
matrices. They often provide comparable a priori convergence information for the linear
iterative systems.
We work with real nn matrices in this section, although the results straightforwardly
extend to complex nn matrices. Let us fix a norm k k on R n . The norm may or may not
come from an inner product this is irrelevant as far as the construction goes. Roughly
speaking, the matrix norm tells us how far the matrix stretches vectors relative to the
given norm.
Theorem 9.16. If k k is any norm on R n , then the quantity
k A k = max { k A u k | k u k = 1 }

(9.23)

defines a norm on the vector space Mnn of all n n matrices, called the natural matrix
norm associated with the given norm.
Proof : First note that k A k < since the maximum is taken on a closed and bounded
subset, namely the unit sphere S1 = { k u k = 1 } of the given norm. To show that (9.23)
defines a norm, we need to verify the three basic axioms of Definition 3.13. Non-negativity,
k A k 0, is immediate. Suppose k A k = 0. This means that k A u k = 0, and hence
A u = 0 for every unit vector k u k = 1. If 0 6= v R n is any nonzero vector, then
u = v/r, where r = k v k, is a unit vector, and so
A v = A(r u) = r A u = 0.
3/7/03

336

(9.24)
c 2003

Peter J. Olver

Therefore, A v = 0 for every v R n , which implies A = O is the zero matrix. This serves
to prove the positivity property. As for homogeneity, if c R is any scalar,
k c A k = max { k c A u k } = max { | c | k A u k } = | c | k A k.
Finally, to prove the triangle inequality, we use the fact that the maximum of the sum
of quantities is bounded by the sum of their individual maxima. Therefore, since the norm
on R n satisfies the triangle inequality,
k A + B k = max { k A u + B u k } max { k A u k + k B u k }
max { k A u k } + max { k B u k } = k A k + k B k.
This completes the proof that the matrix norm satisfies the three basic axioms.

Q.E.D.

The property that distinguishes a matrix norm over a generic norm on the space of
matrices is the fact that it obeys a very useful product inequality.
Theorem 9.17. A natural matrix norm satisfies
k A v k k A k k v k,

for all

A Mnn ,

v Rn.

(9.25)

Furthermore,
k A B k k A k k B k,

for all

A, B Mnn .

(9.26)

Proof : Note first that, by definition k A u k k A k for all unit vectors k u k = 1.


Then, letting v = r u where u is a unit vector and r = k v k, we have
k A v k = k A(r u) k = r k A u k r k A k = k v k k A k,
proving the first inequality. To prove the second, we apply the first to compute
k A B k = max { k A B u k } = max { k A (B u) k }

max { k A k k B u k } = k A k max { k B u k } = k A k k B k.

This completes the proof.

Q.E.D.

The second property implies, in particular, that k A2 k k A k2 ; equality is not necessarily valid. More generally,
Lemma 9.18. If A is a square matrix, then k Ak k k A kk . In particular, if
k A k < 1, then k Ak k 0 as k , and hence A is a convergent matrix: Ak O.
The converse is not quite true; a convergent matrix does not necessarily have matrix
norm less than 1, or even 1 see Example 9.23. An alternative proof of Lemma 9.18
is based on the following useful estimate.
Theorem 9.19. The spectral radius of a matrix is bounded by its matrix norm:
(A) k A k.
3/7/03

337

(9.27)
c 2003

Peter J. Olver

Proof : If is a real eigenvalue, and u a corresponding unit eigenvector, so that


A u = u with k u k = 1, then
k A u k = k u k = | | k u k = | |.

(9.28)

Since k A k is the maximum of k A u k over all possible unit vectors, this implies that
| | k A k.

(9.29)

If all the eigenvalues of A are real, then the spectral radius is the maximum of these
eigenvalue moduli, it too is bounded by k A k, proving (9.27).
If A has complex eigenvalues, then we need to work a little harder. Let = r e i be
ccomplex eigenvalue with complex eigenvector z = u + i v. Note that
A z = z = r e i (u + i v).
Define

m = min k Re e i z k = k (cos )v (sin )w k 0 2 .

(9.30)

Since the indicated subset is a closed curve that does not go through the origin , the
minimal value m > 0. Let 0 denote the value of the angle that produces the minimum,
so

m = k (cos 0 )v (sin 0 )w k = k Re e i 0 z k.

Define the real unit vector

Re e i 0 z
(cos 0 )v (sin 0 )w
u=
=
,
so that
k u k = 1.
m
m
Then

1
1
r
Au =
Re e i 0 A z =
Re r e i 0 e i ) z =
Re e i (0 +) z .
m
m
m

Therefore, using the fact that m is the minimal value in (9.30),


k Au k =

r
k Re e i (0 +) z k r = | |.
m

(9.31)

Again, since k A k is the maximum of k A u k over all possible unit vectors, (9.31) implies
that the inequality (9.29) also holds for complex eigenvalues, completing the proof of the
theorem.
Q.E.D.
Explicit Formulae
Let us now determine the explicit formulae for the matrix norms corresponding to
our most important vector norms. See Example 3.14 for the basic definitions. Let us first
justify our earlier formula (9.33) for the L matrix norm.

This relies on the fact that u, v are linearly independent; see Exercise .

3/7/03

338

c 2003

Peter J. Olver

Definition 9.20. The ith absolute row sum of a matrix A is the sum of the absolute
values (moduli) of the entries in the ith row:
si = | ai1 | + + | ain | =

n
X

j =1

| aij |.

(9.32)

Proposition 9.21. The L matrix norm of a matrix A is equal to the maximal


absolute row sum:

(9.33)
| aij | 1 i n .
k A k = max{s1 , . . . , sn } = max

j =1
Proof : Let s = max{s1 , . . . , sn } denote the right hand side of (9.32). Given any
v R n , we compute

n
n
X
X

k A v k = max
aij vj
max
| aij vj |

j =1
j =1

| aij |
max | vj | = s k v k .
max

j =1

Therefore, k A k s.
On the other hand, suppose the maximal absolute row sum occurs at row i, so
si =

n
X

j =1

| aij | = s.

(9.34)

Let u be defined so that uj = +1 if aij > 0, while uj = 1 if aij < 0. Then k u k = 1.


Moreover, the ith entry of A u is equal to the ith row sum (9.34). This implies that
k A k k A u k ,
which completes the proof.

Q.E.D.

Corollary 9.22. If A has maximal absolute row sum strictly less than 1, then
k A k < 1 and hence A is a convergent matrix.
This is an immediate consequence of Lemma 9.18.
Example 9.23. Consider the symmetric matrix

!
1
1

2
3
A=
.
1
31
4
Its two absolute row sums are
1 1 5
+ = ,
2
3
6
3/7/03

1 1
+ =
3

339

7
12 ,
c 2003

Peter J. Olver

and so
k A k = max

7
6 , 12

5
6

.83333 . . . .

Since the norm is less than 1, A is a convergent matrix. Indeed, its eigenvalues are

9 73
9 + 73
.7310 . . . ,
2 =
.0190 . . . ,
1 =
24
24
and hence the spectral radius is

9 + 73
.7310 . . . ,
(A) =
24
which is slightly smaller than its norm.
The row sum test for convergence is not always conclusive. For example, the matrix

!
1
3

5
A= 2
has matrix norm
k A k = 11
10 > 1.
1
4

3
5

On the other hand, its eigenvalues are (15

601 )/40, and hence its spectral radius is

15 + 601
.98788 . . . ,
(A) =
40

which implies that A is (just barely) convergent, even though its matrix norm is larger
than 1.
The Euclidean matrix norm is a bit harder to compute.
Proposition 9.24. The matrix norm corresponding to the Euclidean norm is given
by
k A k2 =
Proof : Note first that

q
(AT A).

(9.35)

k A v k2 = (A v)T A v = vT (AT A)v = vT K v,


where K = AT A is the symmetric, positive semi-definite Gram matrix (3.43) whose entries
are the dot products of the respective columns of A. The Spectral Theorem 8.25 implies
that we can diagonalize K = Q QT , where Q is an orthogonal matrix whose columns are
the orthonormal eigenvectors of K, while is a diagonal matrix whose diagonal entries
are the eigenvalues 1 2 n 0 of K. As a result, we rewrite the associated
quadratic form in the diagonal form (8.39):
2

k Av k = v Kv = v Q Q v = y y =
3/7/03

340

n
X

i yi2 ,

(9.36)

i=1
c 2003

Peter J. Olver

where y = Q v. In particular, if k u k = 1, and we set w = Q u, then k y k = 1 also due to


the orthogonality of Q. Since 1 is the largest eigenvalue, it is equal to the spectral radius
1 = (K). The preceding formula implies that
2

k A v k 1

n
X

yi2 = 1 = (K).

i=1

Moreover, taking w = e1 , so u = QT w, implies that .


k A u k21 = 1 = (K),
and hence
k A k2 = max { k A u k | k u k = 1 } = 1 = (K),
which proves the result.

Q.E.D.

Corollary 9.25. If A is symmetric, its Euclidean matrix norm is equal to its spectral
radius.
Proof : In this case, (AT A) = (A2 ) = (A)2 . The second equality follows since,
according to Exercise , the eigenvalues of A2 are just the squares of the eigenvalues of
A.
Q.E.D.

0 13 31
1
1 appearing in the Jacobi
Example 9.26. Consider the matrix A = 4

0
2
2
5

1
5

0
iteration schemer for the system (9.49). We compute

0.2225 0.0800
0.1250
AT A = 0.0800 0.1511 0.1111 ,
0.1250 0.1111 0.3611

which has eigenvalues 1 = 0.4472, 2 = 0.26653 = 0.0210. Thus, its spectral radius is
(AT A) = 0.4472, and so the Euclidean matrix norm of A is
q
k A k2 = (AT A) = 0.6687,
proving once again that A is a convergent matrix. Note that, as always, the matrix norm
overestimates the spectral radius (A) = .5.

Unfortunately, as we discovered in Example 9.23, there exist convergent matrices


such that (A) < 1 and yet have matrix norm k A k 1. In such cases, we will not be
able to predict the convergence of the iterative scheme based on the matrix, although we
would expect the convergence to be quite slow. Thus matrix norms are not a foolproof
determinator of convergence. Although this might happen in one particular matrix norm,
it turns out that one can always find a matrix norm which is less than 1. A proof of this
result can be found in [X].
3/7/03

341

c 2003

Peter J. Olver

Theorem 9.27. Let A have spectral radius (A). If > 0 is any positive number,
then there exists a matrix norm k k such that
(A) k A k < (A) + .

(9.37)

Corollary 9.28. If A is a convergent matrix, then there exists a matrix norm such
that k A k < 1.
Proof : By definition, A is convergent if and only if (A) < 1. Choose > 0 such that
(A) + < 1. Any norm that satisfies (9.37) has the desired property.
Q.E.D.
The Gerschgorin Circle Theorem
Although it is not so easy to precisely locate the eigenvalues of a matrix, there is
a relatively simple but very useful result that will often localize them in the complex
plane. The Gerschgorin Circle Theorem serves to restrict the eigenvalues to a certain welldefined region in the complex plane. In favorable situations, this information is sufficient
to determine convergence and other desired properties of the matrix.
Theorem 9.29. Let A be an nn matrix, either real or complex. For each 1 i n,
define the closed circular Gerschgorin disk
Di = { | z aii | ri | z C } ,
Let D =

n
[

i=1

where

ri =

n
X
i=1
i6=j

| aij |.

(9.38)

C denote the union of the Gerschgorin disks. Then all real and complex

eigenvalues of A lie inside the Gerschgorin domain D.


Thus, the ith Gerschgorin disk Di is centered at the ith diagonal entry aii , and has
radius ri equal to the sum of the absolute values of the off-diagonal entries that are in the
ith row of A.
Example 9.30. The matrix

2
A= 1
1

has the following Gerschgorin disks:


D1 = { | z 2 | 1 } ,

1 0
4 1
1 3

D2 = { | z 4 | 2 } ,

D3 = { | z + 3 | 2 } ,

which are plotted in Figure 9.2. The eigenvalues of A are


1 = 3,

2 = 3.1623 . . . ,

3 = 3.1623 . . . .

We observe that 1 , 2 both lie in D1 and D2 , while 3 lies in D3 , and so all three
eigenvalues are in the Gerschgorin domain D = D1 D2 D3 .
3/7/03

342

c 2003

Peter J. Olver

3
2
1
-6

-4

-2

-1
-2
-3

Figure 9.2.

Gerschgorin Disks and Eigenvalues.

Proof of Theorem 9.29 : Let v be an eigenvector of A with eigenvalue . Let u =


v/k v k be the corresponding unit eigenvector with respect to the L norm, so

k u k = max | u1 |, . . . , | un | = 1.

Let ui be an entry of u that achieves the maximum: | ui | = 1. Writing out the eigenvalue
equation A u = u in components, we find
n
X

aij uj = ui ,

which we rewrite as

j =1

n
X
j=1
j6=i

aij uj = ( aii ) ui .

Therefore,

| aii | | ui |
aij uj
j6=i

X
X


|
a
|
|
u
|

| aij | = ri .
ij
j

j6=i
j6=i

Since we chose ui so that | ui | = 1, we conclude that satisfies


| aii | ri ,
and hence Di D belongs to the ith Gerschgorin disk.

Q.E.D.

The Gerschgorin Theorem 9.29 can be used to give a simple, direct proof of Corollary 9.22 If A is any matrix, then the modulus of all points z D i contained in its ith
Gerschgorin disk is bounded by the ith absolute row sum,
| z | | z tii | + | tii | ri + | tii | = si ,
where the final equality follows by comparison of (9.38) and (9.32). Thus, every point
z D in the Gerschgorin set has modulus
| z | max{s1 , . . . , sn } = k A k ,
3/7/03

343

c 2003

Peter J. Olver

bounded by the maximal row sum. Since all eigenvalues j of A are contained in D, they
satisfy
| j | k A k ,
and hence
(A) k A k .
(9.39)
Under the theorems hypothesis, 1 > k A k (A), and hence A is convergent.
As a second application, we give a simple direct test for invertibility of a matrix, that
does not rely on Gaussian elimination or computing determinants. Recall that a matrix is
nonsingular if and only if it does not have a zero eigenvalue. Thus, if 0 does not belong
to its Gerschgorin domain, then the matrix is invertible. This condition requires that the
matrix have large diagonal entries, as quantified by the following definition.
Definition 9.31. A square matrix A is called strictly diagonally dominant if
| aii | >

n
X
i=1
i6=j

| aij |,

for all

i = 1, . . . , n.

(9.40)

In other words, for A to be diagonally dominant, its diagonal entry must be larger, in
absolute
the sum of all the other entries in its row. For example, the matrix
value, than
3
1 1
A = 1 4 2 is strictly diagonally dominant since
2 1 5
| 3 | > | 1 | + | 1 |,

| 4 | > | 1 | + | 2 |,

| 5 | > | 2 | + | 1 |.

Diagonally dominant matrices arise in many applications, particularly in finite difference and finite element methods for solving boundary value problems. As we shall see,
they are the most common class of matrices to which iterative solution methods can be
successfully applied.
Proposition 9.32. A strictly diagonally dominant matrix is nonsingular.
Proof : The diagonal dominance inequalities (9.40) imply that the radius of the i th
Gershgorin disk is strictly less than the modulus of its center:
ri < normaii .
Thus, the disk cannot contain 0; indeed, if z Di , then, by the triangle inequality
ri > | z aii | | aii | | z | > ri | z |,

and hence

| z | > 0.

Thus, 0 6 D does not lie in the Gershgorin domain and hence cannot be an eigenvalue.
Q.E.D.
Warning: The converse is obviously not true. There are plenty of nonsingular matrices
that are not diagonally dominant.
3/7/03

344

c 2003

Peter J. Olver

9.4. Markov Processes.


A discrete process in which the probability of a system being in a particular state during a given time period depends only its state in the immediately preceding time period is
known as a Markov chain, in honor of the pioneering studies of the Russian mathematician
Andrei Markov. Markov chains are described by linear iterative systems whose coefficient
matrices have a special form, and hence can be analyzed by our eigenvalue methods.
For example, suppose you are interested in predicting whether the weather in your
city on a particular day will be either sunny or cloudy. Consulting weather records over
the past decade, you determine that
(i ) If today is sunny, there is a 70% chance that tomorrow will also be sunny,
(ii ) But, if today is cloudy, the chances are 80% that tomorrow is also cloudy.
Question: given that today is sunny, what is the probability that next Saturdays weather
will also be sunny?
To mathematically formulate this process, we let s(k) denote the probability that day
k is sunny and c(k) the probability that it is cloudy. If we assume that these are the only
possibilities, then the individual probabilities must sum to 1, so
s(k) + c(k) = 1.
According to our data, the probability that the next day is sunny or cloudy is expressed
by the equations
s(k+1) = .7 s(k) + .2 c(k) ,

c(k+1) = .3 s(k) + .8 c(k) .

(9.41)

Indeed, day k + 1 could be sunny either if day k was with a 70% chance or, if day
k was cloudy, there is still a 20% chance of day k + 1 being sunny. We rewrite (9.41) in a
more convenient matrix form:

(k)
.7 .2
s
(k+1)
(k)
(k)
u
=Tu ,
where
T =
,
u =
.
(9.42)
.3 .8
c(k)
In a Markov process, the vector of probabilities u(k) is known as the k th state vector and the
matrix T is known as the transition matrix , whose entries fix the transition probabilities
between the states.
T
By assumption, our initial state vector is u(0) = ( 1, 0 ) , since we know for certain
that today is sunny. Rounding off to three decimal places, the subsequent state vectors
are


0.438
0.475
0.55
.7
(4)
(3)
(2)
(1)
,
,
u =
,
u =
,
u =
u =
0.563
0.525
0.45
.3

0.402
0.405
0.410
0.419
(8)
(7)
(6)
(5)
.
,
u =
,
u =
,
u =
u =
0.598
0.595
0.591
0.581
T

The iterates converge fairly rapidly to ( .4, .6 ) , which is a fixed point for the iterative
system (9.42). Thus, in the long run, 40% of the days will be sunny and 60% will be
cloudy. Let us explain why this happens.
3/7/03

345

c 2003

Peter J. Olver

Figure 9.3.

Probability Vectors in R 3 .
T

Definition 9.33. A vector u = ( u1 , u2 , . . . , un ) R n is called a probability vector


if all its individual entries 0 ui 1 lie between 0 and 1, and, moreover, the sum of its
entries is u1 + + un = 1.
For example, the possible probability vectors u Rx3 fill the equilateral triangle
illustrated in Figure 9.3. We interpret the entry ui of a probability vector as the probability
the system is in state number i. The fact that the entries add up to 1 means that they
represent a complete list of probabilities for the possible states of the system.
Remark : Any nonzero vector 0 6= v = v1 , . . . , vn with all non-negative entries: vi 0
for i = 1, . . . , n, can be converted into a probability vector
u=

v
v1 + + v n

(9.43)
T

by dividing by the sum of its entries. For example, if v = ( 3, 2, 0, 1 ) , then u =


1 1

1 T
is the corresponding probability vector.
2 , 3 , 0, 6
In general, a Markov chain is represented by a first order linear iterative system

(9.44)

u(k+1) = T u(k) .
The transition matrix

T = tij ),

0 tij 1,

t1j + + tnj = 1,

(9.45)

contains all the transitional probabilities. The entry tij represents the probability that
the system will switch from state j to state i. (Note the reversal of indices.) Since this
covers all possible transitions, the column sums of the transition matrix are all equal to 1.
An easy Exercise shows that if u(k) is a valid probability vector, so is u(k+1) = T u(k) .
Thus, the solution u(k) = T k u(0) to the Markov process represents a sequence or chain
of probability vectors.
Let us now investigate the convergence of the Markov chain.
3/7/03

346

c 2003

Peter J. Olver

Definition 9.34. A transition matrix (9.45) is regular if every entry of some power
T is strictly positive.
k

Since the column sums of a transition matrix are all 1, it can be proved that the entries
of T k are always between 0 and 1; see Exercise . Thus, the regularity condition just
requires T k to have no zero entries. In particular, if T itself has no transition probabilities
equal to 0, then it is regular.
Remark : The term regular transition matrix is not the same as our earlier term
regular matrix, which was used to describe matrices with an L U factorization.
The asymptotic behavior of a regular Markov chain is governed by the following key
result.
Theorem 9.35. If T is a regular transition matrix, then it admits a unique probability eigenvector u? with eigenvalue 1 = 1. Moreover, any Markov chain with coefficient
matrix T will converge to the distinguished probability vector: u (k) u? as k .
The proof of this result appears at the end of this section.
Example 9.36. For the weather transition matrix (9.42), the eigenvalues and eigenvectors are

!
!
2
1
2 = .5,
v2 =
.
1 = 1,
v1 = 3 ,
1
1
The first eigenvector is then converted into a probability vector via (9.43):
! !
2
2
1
3
5
u? = u 1 =
=
.
3
1 + 23 1
5

This distinguished probability vector represents the final asymptotic state of the system
after many iterations, no matter what the initial state. Thus, our earlier observation that
about 40% of the days will be sunny and 60% will be cloudy holds no matter what the
initial weather is.
Example 9.37. A taxi company in Minnesota serves the cities of Minneapolis and
St. Paul, and the nearby suburbs. Records indicate that, on average, 10% of the customers
taking a taxi in Minneapolis go to St. Paul and 30% go to the suburbs. Customers alighting
in St. Paul have a 30% chance of going to Minneapolis and 30% chance of going to the
suburbs, while suburban customers choose Minneapolis 40% of the time and St. Paul 30%
of the time. The owner of the taxi company is interested in knowing where the taxis
will end up, on average. We write this as a Markov process. The entries of the state
(k)
(k)
(k)
vector u(k) = (u1 , u2 , u3 )T tell what proportion of the taxi fleet is, respectively, in
Minneapolis, St. Paul and the suburbs. Using the data, we construct the relevant transition
matrix

.6 .3 .4
T = .1 .4 .3 .
.3 .3 .3
3/7/03

347

c 2003

Peter J. Olver

1.5

0.5

-1.5

-1

-0.5

0.5

1.5

-0.5

-1

-1.5

Figure 9.4.

Gerschgorin Disks for a Transition Matrix.

Note that T regular since it has no zero entries. The probability eigenvector
u? = ( 0.471429 . . . , 0.228571 . . . , 0.3 )

corresponding to the unit eigenvalue 1 = 1 is found by first solving the linear system
(T I )v = 0 and then converting the solution v into a valid probability vector by
use of formula (9.43). According to Theorem 9.35, no matter how the taxis are initially
distributed, ultimately about 47% of the taxis will be in Minneapolis, 23% in St. Paul,
and 30% in the suburbs. This can be confirmed by running numerical experiments on the
system.
Remark : The convergence rate of the Markov chain to its steady state is governed by
the second largest eigenvalue 2 < 1. The closer 2 is to 0, the faster the process converges.
In the taxi example, 2 = .3 (and 3 = 0) and so the convergence to steady state is fairly
rapid.
Proof of Theorem 9.35 : We begin the proof by replacing T by its transpose M = T T ,
keeping in mind that every eigenvalue of T is also an eigenvalue of M ; see Exercise for
details. The conditions (9.45) tell us that the matrix M has entries 0 m ij = tji 1,
n
X
mij = 1 of M are all equal to 1. Since
and, moreover, the (absolute) row sums si =
i=1

M k = (T k )T , regularity of T implies that some power M k has all positive entries.


T
According to Exercise , if z = ( 1, . . . , 1 ) is the column vector all of whose entries
are equal to 1, then M z is the vector of row sums for the matrix M . A transition matrix

Theorem 9.35 guarantees an eigenvector v with all non-negative entries.

3/7/03

348

c 2003

Peter J. Olver

T has all its column sums all equal to 1, and hence all the row sums of its transpose M
are also all equal to 1. Therefore, M z = z, which implies that z is an eigenvector of M
with eigenvalue 1 = 1. Thus, T also has 1 as an eigenvalue (although it corresponds to a
different eigenvector).
Let us next prove that 1 = 1 is a simple eigenvalue. This is equivalent to the
statement that the only vectors satisfying M v = v are those with all equal entries v 1 =
= vn = a, and hence v = a z is a scalar multiple of the particular eigenvector z. Let
us first prove this assuming all of the entries of M are positive, and so 0 < m ij = tji < 1
for all i, j. Suppose v is an eigenvector with not all equal entries. Let v k be the mimimal
entry of v, so vk vi for all i 6= k and at least one of those inequalities is strict. Then the
k th entry of the eigenvector equation v = M v is
vk =

n
X

mkj vj <

n
X

mkj vk = vk ,

j =1

j =1

where the strict inequality follows from the positivity of the entries of M , and the final
equality follows from the fact that M has unit row sums. Thus, we are led to a contradiction, and the claim follows. If M has one or more 0 entries, but M k has all positive entries,
then we apply the previous argument, using the fact that M k v = v whenever M v = v.
Finally, let us prove that all the other eigenvalues of M are less than 1 in modulus.
For this we appeal to the Gerschgorin Circle Theorem 9.29. The Gerschgorin disk D i is
centered at mii and has radius ri = si mii = 1 mii . Thus the disk lies strictly inside
the open unit disk | z | < 1 except for a single boundary point at z = 1; see Figure 9.4. The
Circle Theorem 9.29 implies that all eigenvalues except the unit eigenvalue = 1 must lie
strictly inside the unit disk, and so | j | < 1 for j 2.
Therefore, the matrix M , and, hence, also T satisfy the hypotheses of Theorem 9.14.
We conclude that the iterates u(k) = T k u(0) u? converge to a multiple of the unit
eigenvector of T . If the initial condition u(0) is a probability vector, then so is every
subsequent state vector u(k) , and so their limit u? must also be a probability vector. This
completes the proof of the theorem.
Q.E.D.

9.5. Iterative Solution of Linear Systems.


In this section, we consider iterative approaches to the solution of linear systems
Au = b

(9.46)

of n equations in n unknowns. The resulting algorithms will provide an attractive alternative to Gaussian elimination, particularly for the large, sparse systems arising in the
numerical solution to differential equations. One major advantage of an iterative technique
is that it gives progressively more and more accurate approximations to the solution, and
hence, by prolonging the iterations, one can compute the solution to any desired order
of accuracy . Moreover, even performing just a few iterations may produce a reasonable

Although the round-off errors due to the finite precision of the computer will eventually
interfere in the computations.

3/7/03

349

c 2003

Peter J. Olver

approximation to the true solution in stark contrast to Gaussian elimination, where one
must continue the algorithm through to the bitter end before any useful information can
be extracted. A partially completed Gaussian elimination is of scant use! On the other
hand, specific iterative schemes are not universally applicable to all linear systems, and
their design relies upon the detailed structure of the coefficient matrix A.
We shall be attempting to solving (9.46) by an iterative system of the form
u(k+1) = T u(k) + c,

u(0) = u0 ,

(9.47)

where T is a fixed n n matrix and c a fixed vector. This is a slight generalization of a


linear iterative system (9.1), in that the right hand side is now an affine function of u (k) . If
the solutions to the affine iterative system converge, u(k) u? as k , then u? solves
the fixed-point equation
u? = T u? + c.
(9.48)
Indeed, both u(k) and u(k+1) in (9.47) converge to the same u? , and so the limiting fixed
point equation (9.48) immediately follows. Thus we need to design our system so that
(a) The solution to the fixed-point system (9.48) is the same as the solution to the
original system (9.46), and
(b) The iterates defined by (9.47) converge to the solution.
Before exploring these issues in depth, let us begin with a simple example.
Example 9.38. Consider the linear system
3 x + y z = 3,

x 4 y + 2 z = 1,

2 x y + 5 z = 2,

(9.49)

which we rewrite in matrix form A u = b, with coefficient matrix

3
1 1
x
3
A = 1 4 2
and
u = y ,
b = 1 .
2 1 5
z
2
One easy way to rewrite the system

2
T = I A = 1
2

in fixed-point form (9.48) is to set

3
1 1
c = b = 1 .
5 2 ,
2
1 4

(9.50)

Clearly, A u = b if and only if T u + b = ( I A)u + b = u, and hence the fixed


point coincides with the solution to the original system. The resulting iterative system
u(k+1) = T u(k) + c has the explicit form
x(k+1) = 2 x(k) y (k) + z (k) + 3,

y (k+1) = x(k) + 5 y (k) 2 z (k) 1,


z (k+1) = 2 x(k) + y (k) 4 z (k) + 2.

Another possibility is to solve the first equation in (9.49) for x, the second for y and
the third for z, so that
x = 13 y +
3/7/03

1
3

z + 1,

y=

1
4

x+

350

1
2

z + 41 ,

z=

2
5

x+

1
5

y + 25 .

c 2003

Peter J. Olver

u(k+1) = T u(k) + b

0
1
2
3
4
5
6
7
8
9
10
11

0
0
0
3
1
2
0
13
1
15
64
7
30
322
4
261
1633
244
870
7939
133
6069
40300
5665
22500
196240
5500
145743
992701
129238
571980
4850773
184261
3522555 24457324 2969767

u(k+1) = Tb u(k) + b
c

0
1
1.05
1.05
1.0075
1.005
0.9986
1.0004
0.9995
1.0001
0.9999
1.0000

0
0.25
0.7
0.9375
0.9925
1.00562
1.002
1.0012
1.0000
1.0001
0.9999
1.0000

0
0.4
0.85
0.96
1.0075
1.0015
1.0031
0.9999
1.0004
0.9998
1.0001
1.0000

The solution to this fixed point system also coincide with that of the original linear system.
The corresponding iteration takes the form
x(k+1) = 13 y (k) +
y (k+1) =

z (k+1) =

1
4
2
5

x(k) +
x(k) +

1 (k)
+ 1,
3 z
1 (k)
+ 14 ,
2 z
1 (k)
+ 25 .
5 y

(9.51)

In matrix notation, this becomes


u(k+1) = Tb u(k) + b
c,

where

Tb = 14
2
5

13
0
1
5

1
3
1
2


1
1
b
c = 4 .

(9.52)

2
5

Do the resulting iterative schemes (9.47) converge to the solution x = y = z = 1? The


results, starting with initial guess u(0) = (0, 0, 0), appear in the accompanying table.
For the first scheme, the answer is no the iterations become sucessively wilder and
wilder. Indeed, this occurs no matter how close the initial guess is to the actual solution
unless it happens to be exactly equal: u(0) = u? . (And even then, numerical errors
could creep in and send the iterations off to .) In the second case, the convergence is
quite good, and it does not take too long, even starting from a bad initial guess, to obtain
an accurate approximation to the solution.
Of course, in such a small example, it would be silly to use iteration, when Gaussian
elimination can be done by hand and produces the solution almost immediately. However,
we use the small examples for illustrative purposes, reserving the actual application of the
iterative methods to large linear systems arising in applications.
3/7/03

351

c 2003

Peter J. Olver

The convergence of solutions to (9.47) to the fixed point u? is based on the behavior
of the error vectors
e(k) = u(k) u? ,
(9.53)
which measure how close the iterates are to the actual solution. Let us find out how the
successive error vectors are related. We compute
e(k+1) = u(k+1) u? = (T u(k) + a) (T u? + a) = T (u(k) u? ) = T e(k) .
Therefore, the error vectors satisfy the linear iterative system
e(k+1) = T e(k)

(9.54)

with the same coefficient matrix T , and hence, e(k) = T k e(0) . Now, the solutions to (9.47)
converge to the fixed point, u(k) u? , if and only if the error vectors e(k) 0 as k .
Consequently, Lemma 9.9 implies the following key result.
Proposition 9.39. The iterative system (9.47) will converge to the solution to (9.48)
if and only if T is a convergent matrix, i.e., (T ) < 1.
Moreover, the spectral radius (T ) of the coefficient matrix will govern the speed of
convergence. The goal is to construct an iterative scheme whose coefficient matrix has as
small a spectral radius as possible. At the very least, the spectral radius must be less than
1. For example, in the two iterative schemes presented in Example 9.38, the spectral radii
of the coefficient matrices are found to be
(T ) = 4.9675 . . . ,

( Tb ) = 0.5.

Therefore, T is not a convergent matrix, which explains the behavior of its iterates, whereas
Tb is convergent, and one expects the error to roughly decrease by a factor of 21 with each
new iterate.
The Jacobi Method

The first general iterative method for solving linear systems is based on the same
simple idea used in our illustrative Example 9.38. Namely, we solve the i th equation in the
system A u = b, which is
n
X
aij uj = bi ,
j =1

for the ith variable. To do this, we need to assume that all the diagonal entries of A are
nonzero: aii 6= 0. The result is
1
ui =
aii
where

3/7/03

a
ij ,
aii
tij =

0,

n
X

i6=j = 1

n
X
bi
aij uj +
tij uj + ci ,
=
aii
j =1

i 6= j,
i = j,
352

and

(9.55)

ci =

bi
.
aii

c 2003

(9.56)

Peter J. Olver

Equation (9.55) can be rewritten in fixed point form


u = T u + c,

(9.57)

and forms the basis of the Jacobi method


u(k+1) = T u(k) + c,

u(0) = u0 ,

(9.58)

named after the influential nineteenth century German mathematician Carl Jacobi. The
explicit form of the Jacobi iterative scheme is
(k+1)

ui

n
X

1
aii

(k)

aij uj

i6=j = 1

bi
.
aii

(9.59)

In matrix language, what we are doing is splitting off the diagonal part of A and
inverting it. Let us rederive the Jacobi method in a direct matrix form. We begin by
decomposing the coefficient matrix
A = L + D + U,

(9.60)

into the sum of a strictly lower triangular matrix L, a diagonal matrix D, and a strictly
upper triangular matrix U , each of which is uniquely specified; see Exercise . For example,
in the case of the coefficient matrix

3
1 1
A = 1 4 2 ,
(9.61)
2 1 5
the decomposition (9.60) yields

0
0 0
L= 1
0 0,
2 1 0

D= 0
0

0 0
4 0 ,
0 5

U= 0
0

1 1
0 2 .
0 0

Warning: The L, D, U in the elementary additive decomposition (9.60) have nothing


to do with the L, D, U in factorizations arising from Gaussian elimination. The latter play
no role in the iterative solution methods considered here.
We then rewrite the system
A u = (L + D + U ) u = b
in the alternative form
D u = (L + U ) u + b.
The Jacobi fixed point equation (9.57) amounts to solving for
u = T u + c,
3/7/03

where

T = D 1 (L + U ),
353

c = D 1 b.
c 2003

(9.62)
Peter J. Olver

For the example (9.61), we recover the Jacobi iteration matrix by

0 31 13

1 .
T = D1 (L + U ) = 41

0
2
1
5

2
5

(9.63)

Deciding whether the Jacobi method converges for a specific matrix is not an easy
task. However, it can be shown that Jacobi iteration will always converge for matrices
that have large diagonal terms, as prescribed by Definition 9.31.
Theorem 9.40. If A is strictly diagonally dominant, then the associated Jacobi
iteration scheme converges.

Proof : We shall prove that k T k < 1, and so Corollary 9.22 implies that T is a
convergent matrix. The row sums of the Jacobi matrix T = D 1 (L + U ) are, according
to (9.56),
n
n
X
X
1
| tij | =
si =
| aij | < 1
(9.64)
|
a
|
ii
j =1
i6=j = 1

because A is strictly diagonally dominant. Thus, k T k = max{s1 , . . . , sn } < 1, and the


result follows.
Q.E.D.
Example 9.41. Consider the linear system
4 x + y + w = 1,
x + 4 y + z + v = 2,
y + 4 z + w = 1,

x + z + 4 w + v = 2,
y + w + 4 v = 1.

The Jacobi method solves the respective equations for x, y, z, w, v, leading to the iterative
scheme
x(k+1) = 14 y (k) 41 w(k) + 1,
y (k+1) = 14 x(k) 14 z (k) 14 v (k) + 2,
z (k+1) = 14 y (k) 41 w(k) 1,

(9.65)

w(k+1) = 14 x(k) 14 z (k) 14 v (k) + 2,


v (k+1) = 14 y (k) 41 w(k) + 1.

The coefficient matrix of the original system

4 1
1 4

A = 0 1

1 0
0 1
3/7/03

354

0
1
4
1
0

1
0
1
4
1

0
1

0 ,

1
4

(9.66)

c 2003

Peter J. Olver

is diagonally dominant, and so we are guaranteed that the Jacobi iterations will eventually
converge to the solution. Indeed, the Jacobi scheme takes the iterative form (9.62), with

14

T =
0

14
0

14
0

14
0

14

14

0
14

14

0
,

14

14

14

14

1
4
1
2

c= 4
.
1
2

(9.67)

1
4

Indeed, the matrix norm k T k = 43 < 1, and hence the convergence rate of the iterates
to the solution is at least .75. In fact, the spectral radius is (T ) = .6124, which is the true
convergence rate. To obtain four decimal place accuracy in the solution, we anticipate
about log(.5 104 )/ log .6124 20 iterations. Indeed, starting with x(0) = y (0) = z (0) =
w(0) = v (0) = 0, the Jacobi iterates converge to give the solution
x = .1,

y = .7,

z = .6,

w = .7,

v = .1,

to four decimal places in exactly 20 iterations.


The GaussSeidel Method
The GaussSeidel method relies on a slightly more sophisticated implementation of
the Jacobi process. To understand how it works, it will help to write out the Jacobi
iteration scheme (9.58) in gory detail:
(k)

(k+1)

(k+1)

= t21 u1 +

(k+1)

= t31 u1 + t32 u2 +

u1
u2
u3

..
.

(k)

(k)

..
.

(k)

..
.
(k)

(k)

(k)

(k)

(k)

t12 u2 + t13 u3 + + t1,n1 un1 + t1n un(k) + c1 ,


t23 u3 + + t2,n1 un1 + t2n un(k) + c2 ,
(k)

..
(k)

+ t3,n1 un1 + t3n un(k) + c3 ,


..

.
(k)

(k)

u(k+1)
= tn1 u1 + tn2 u2 + tn3 u3 + + tn,n1 un1
n

(9.68)

..
.

+ cn ,

where we are explicitly noting the fact that the diagonal entries of T vanish. Observe
that we are using the entries of u(k) to compute all of the updated values of u(k+1) .
Presumably, if the iterates u(k) are converging to the solution u? , then their individual
(k+1)
entries are also converging, and so each uj
should be a better approximation to u?j
(k)

(k+1)

than uj is. Therefore, if we begin the k th Jacobi iteration by computing u1


using
the first equation, then we are tempted to use this new value instead of the previous, less

If we use the matrix norm to estimate the spectral radius, we would overestimate our guess
of the number of iterates by log(.5 104 )/ log .75 34.

3/7/03

355

c 2003

Peter J. Olver

(k)

accurate value u1
equation

in each of the subsequent equations. In particular, we use the modified


(k+1)

u2

(k+1)

= t21 u1

(k)

+ t23 u3 + + t1n u(k)


n + c2

to update the second component of our iterate. This more accurate value should then be
(k+1)
used to update u3
, and so on. The upshot of these considerations is the GaussSeidel
iteration scheme
(k+1)

ui

(k+1)

= ti1 u1

(k+1)

+ + ti,i1 ui1

(k)

+ ti,i+1 ui+1 + + tin un(k) + ci .

(9.69)
(k+1)

At the k th stage of the iteration, we use (9.69) to compute the updated entries u1
,
(k+1)
(k+1)
u2
, . . . , un
in their numerical order. Once an entry has been updated, the new
value is immediately used in all subsequent updates.
Example 9.42. For the linear system
3 x + y z = 3,

x 4 y + 2 z = 1,

2 x y + 5 z = 2,

the Jacobi iteration method was given in (9.51). To obtain the corresponding GaussSeidel
scheme we use updated values of x, y and z as they become available. Explicitly,
x(k+1) = 13 y (k) +
y (k+1) =

z (k+1) =

1
4
2
5

1 (k)
+ 1,
3 z
x(k+1) + 21 z (k) + 41 ,
x(k+1) + 51 y (k+1) + 52 .

The resulting iterates starting with u(0) = 0 take the values

1.0000
1.1333
1.0222
u(1) = 0.5000 ,
u(2) = 0.9833 ,
u(3) = 1.0306 ,
0.9000
1.0500
1.0150

0.9977
1.0000
1.0001
u(5) = 0.9990 ,
u(6) = 0.9994 ,
u(7) = 1.0000 ,
0.9989
0.9999
1.0001

(9.70)

u(4)

u(8)

0.9948
= 1.0062 ,
0.9992

1.0000
= 1.0000 .
1.0000

The iterations have converged to the solution to 4 decimal places after only 8 iterations
as opposed to the 11 iterations required by the Jacobi method indicating that the
GaussSeidel method is converging faster.
The GaussSeidel iteration scheme is particularly suited to implementation on a serial
(k)
computer, since one can immediately replace each component ui by its updated value
(k+1)
at each step in the computation, thereby also saving on storage in the computers
ui
memory. In contrast, the Jacobi scheme requires us to retain all the old values of u (k)
until all of the new values of u(k+1) have been computed. Moreover, GaussSeidel typically
(although not always) converges faster than Jacobi, making it the iterative algorithm of
choice for serial processors. On the other hand, with the advent of parallel processing
3/7/03

356

c 2003

Peter J. Olver

machines, variants of the Jacobi scheme have been making a comeback. Whereas Gauss
Seidel necessitates performing only one entry update at a time, the Jacobi method can be
more easily parallelized.
What is GaussSeidel really up to? Let us rewrite the basic iterative equation (9.69)
by multiplying by aii and moving the terms involving u(k+1) to the left hand side. In view
of the formula (9.56) for the entries of T , the resulting equation is
(k+1)

ai1 u1

(k+1)

+ + ai,i1 ui1

(k+1)

+ aii ui

(k)

= ai,i+1 ui+1 ain un(k) + bi .

In matrix form, taking (9.60) into account, this reads


(L + D)u(k+1) = U u(k) + b,

(9.71)

and so can be viewed as a linear system of equations for u(k+1) with lower triangular
coefficient matrix L+D. Note that the fixed point of (9.71), namely (L+D)u = U u+b,
coincides with the solution to the original system A u = (L + D + U )u = b. Gauss
Seidel is merely implementing the standard forward substitution method to solve the lower
triangular system (9.71) for the next iterate:
u(k+1) = (L + D)1 U u(k) + (L + D)1 b.

(9.72)

The latter system is in our more usual iterative form


u(k+1) = Te u(k) + e
c,

where

Te = (L + D)1 U,

e
c = (L + D)1 b.

Consequently, the convergence of the GaussSeidel iterates is governed


radius of its coefficient matrix Te = (L + D)1 U .

3
1
For example, in the case of the coefficient matrix A = 1 4
2 1
ple 9.42, we have

3
0 0
0 1 1
L + D = 1 4 0 ,
U = 0 0 2 .
2 1 5
0 0 0
Hence, the GaussSeidel coefficient matrix is

Te = (L + D)1 U = 0

0.3333

0.0833
0.1500

0.3333

(9.73)

by the spectral

1
2 in Exam5

0.5833 .
0.2500

The matrix Te has eigenvalues 0 and 0.0833 0.2444 i , and hence its spectral radius is
( Te ) 0.2582. This is roughly the square of the Jacobi spectral radius of .5, which
tells us that the Gauss-Seidel iterations will converge about twice as fast to the solution,
in accordance with our earlier observation. Indeed, although examples exist where the
Jacobi method converges faster, in many practical situation, the GaussSeidel scheme
tends to converge about twice as fast.
3/7/03

357

c 2003

Peter J. Olver

General conditions guaranteeing the convergence of the GaussSeidel method are hard
to establish. But, like the Jacobi method, diagonally dominant matrices are still handled
well.
Theorem 9.43. If A is strictly diagonally dominant, then the GaussSeidel iteration
scheme converges.
Proof : Let e(k) = u(k) u? denote the k th GaussSeidel error vector. As in (9.54),
the error vectors satisfy the homogeneous iteration
e(k+1) = Te e(k) ,

(L + D)e(k+1) = U e(k) .

or, equivalently,

We write out this equation in components,


(k+1)

ei
Let

(k+1)

= ti1 e1

(k+1)

+ ti,i1 ei1

(k)

+ ti,i+1 ei+1 + + tin en(k) .

(9.74)

(k)
m(k) = k e(k) k = max | e1 |, . . . , | e(k)
n |

denote the L norm of the error vector. We claim that


m(k+1) s m(k) ,

(9.75)

where s = k T k denotes the L matrix norm of the Jacobi matrix. To prove Theorem 9.40, we showed that diagonal dominance of A implies that s < 1. Using this fact and
the claim (9.75), we infer that
m(k) sk m(0) 0,

as

k ,

and hence e(k) 0, demonstrating the theorem.


To prove (9.75), we use induction on i = 1, . . . , n. Thus, our induction hypothesis is
that
(k+1)
| ej
| s m(k) m(k)
for
j = 1, . . . , i 1.
Moreover, since m(k) = k e(k) k , we have
(k)
| e(k)
n |m

for all
(k+1)

We use these two inequalities to estimate | ei


(k+1)

| ei

(k+1)

j = 1, . . . , n.
| from (9.74):

(k+1)

(k)

| | ti1 | | e1
| + + | ti,i1 | | ei1 | + | ti,i+1 | | ei+1 | + + | tin | | en(k) |

| ti1 | + + | tin | m(k) = si m(k) s m(k) ,

which completes the induction step. Finally, the maximum

(k+1)

m(k+1) = max | e1
|, . . . , | e(k+1)
|
s m(k)
n

also satisfies the same bound, and hence (9.75) follows.


3/7/03

358

c 2003

Q.E.D.
Peter J. Olver

Example 9.44. For the linear system considered in Example 9.41, the GaussSeidel
iterations take the form
x(k+1) = 41 y (k) 41 w(k) + 1,

y (k+1) = 14 x(k+1) 41 z (k) 14 v (k) + 2,


z (k+1) = 14 y (k+1) 41 w(k) 1,

w(k+1) = 14 x(k+1) 41 z (k+1) 41 v (k) + 2,


v (k+1) = 14 y (k+1) 41 w(k+1) + 1.

Starting with x(0) = y (0) = z (0) = w(0) = v (0) = 0, the GaussSeidel iterates converge
to the solution x = .1, y = .7, z = .6, w = .7, v = .1, to four decimal places in 11
iterations, again rougly twice as fast as the Jacobi scheme.
Indeed, the convergence rate is governed by the Gauss-Seidel coefficient matrix Te
based on the coefficient matrix (9.66), which is

4
1

1
0

0
4
1
0
1

0
0
4
1
0

0
0
0
4
1

0 1 0
0
0

0
0

0 0
4
0

1
0
0
0
0

0
1
0
0
0

1
0
1
0
0

0
0
0
1

= 0
0

1 0
0
0

0.2500
0.0625
0.0156
0.0664
0.0322

0
0.2500
0.0625
0.0156
0.0664

0.2500
0.0625
0.2656
0.1289
0.0479

0
0.2500

.
0.0625

0.2656
0.1289

Its spectral radius is ( Te ) = .3936, which is, as in the previous example, approximately
the square of the spectral radius of the Jacobi coefficient matrix, which explains its doubly
fast convergence rate.
Successive OverRelaxation (SOR)
As we know, the smaller the spectral radius (or matrix norm) of the coefficient matrix,
the faster the convergence of the iterative method. Thus, there has always been a premium
on the design of new methods for accelerating the convergence. In his 1950 thesis, the
American mathematician David Young discovered a simple modification of the Jacobi and
GaussSeidel methods that can result in a dramatic speed up in the rate of convergence.
The method, known as successive over-relaxation, and often abbreviated as SOR, has
become the iterative method of choice in many modern applications.
In practice, determining the optimal iterative scheme to solve a given linear system is
as hard a solving the system itself. Therefore, one relies on a few tried and true techniques
for building a good iterative scheme. Every decomposition
A=M N

(9.76)

of the coefficient matrix of A u = b into the difference of two matrices leads to an equivalent
fixed point system, namely
M u = N u + b.
(9.77)
As long as we take M to be invertible, we can place the system in fixed point form
u = M 1 N u + M 1 b = T u + c,
3/7/03

359

where

T = M 1 N,

c = M 1 b.

c 2003

Peter J. Olver

We are free to choose any such M , which specifies N = A M . However, for the resulting
iterative scheme u(k+1) = T u(k) + c to be practical we ask that
(a) T = M 1 N is a convergent matrix, and
(b) M can be easily inverted.
The second requirement ensures that the iterative system
M u(k+1) = N u(k) + b

(9.78)

can be solved for u(k+1) with minimal computational effort. Typically, this requires that
M be either a diagonal matrix, in which case the inversion is immediate, or upper or lower
triangular, in which case one uses back or forward substitution to solve for u (k+1) .
With this in mind, we now introduc the SOR method. It relies on a slight generalization of the GaussSeidel decomposition of the matrix into lower, diagonal and upper
triangular parts. The starting point is to write

A = L + D + U = L + D ( 1) D U ,
(9.79)
where 0 6= is an adjustable scalar parameter. We decompose the system A u = b as

(L + D)u = ( 1) D U u + b.
(9.80)
It turns out to be more convenient to divide (9.80) through by , and write the resulting
iterative system in the form

( L + D)u(k+1) = (1 ) D U u(k) + b,
(9.81)

where = 1/ is called the relaxation parameter . Assuming, as usual, that all diagonal
entries of A are nonzero, the matrix L + D is invertible and lower triangular matrix, and
so we can use forward substitution to solve the iterative system (9.81) to recover u (k+1) .
The explicit formula for its ith entry
(k+1)

ui

(k+1)

= ti1 u1

(k+1)

+ + ti,i1 ui1
(k)

(k)

+ (1 ) ui

(9.82)

+ ti,i+1 ui+1 + + tin u(k)


n + ci ,

where tij and ci denote the original Jacobi values (9.56). Thus, to obtain the SOR scheme
(9.82), we merely multiply the right hand side of the GaussSeidel scheme (9.69) by the
(k)
relaxation parameter and append the diagonal term (1 ) ui . In particular, if = 1,
then the SOR method reduces to the GaussSeidel method. Choosing < 1 leads to an
under-relaxed method, while > 1, known as over-relaxation, is the choice that works in
(k+1)
most practical instances. As in the GaussSeidel approach, we update the entries u i
in numerical order i = 1, . . . , n.
To analyze the convergence rate of the SOR scheme (9.81), we rewrite it in the fixed
point form
u(k+1) = T u(k) + c ,
(9.83)
where

T = T = ( L + D)1 (1 ) D U ,

3/7/03

360

c = ( L + D)1 b.
c 2003

(9.84)

Peter J. Olver

The rate of convergence of the SOR method is governed by the spectral radius of its
coefficient matrix T . The goal is to choose the relaxation parameter so as to make the
spectral radius of T as small as possible. As we will see, a clever choice of will result
in a dramatic speed up in the convergence of the iterative method. Before stating some
general facts (albeit without proof) let us analyze a simple example.

2 1
, which we write as L+D +U ,
Example 9.45. Consider the matrix A =
1 2
where

0 1
2 0
0 0
.
,
U=
,
D=
L=
0 0
0 2
1 0

0 12
1
Jacobi iteration uses the coefficient matrix T = D (L + u) = 1
. The Jacobi
0
2
spectral radius is (T ) = .5, and hence it takes, on average, roughly 3.3 log .1/ log .5
iterations to produce each new decimal place of accuracy in the solution.
The SOR scheme (9.81) takes the explicit form

2(1 )

2
0
(k+1)
u(k) + b,
u
=
0
2 (1 )
2
where GaussSeidel is the case = 1. The SOR coefficient matrix is

1

1
1
2 (1 )

2
0

2
= 1
.
T =
1
2
0
2 (1 )
2
2 (1 )
4 (2 )
To compute the eigenvalues of T , we form its characteristic equation

det(T I ) = 2 2 2 + 14 2 + (1 )2
= ( + 1)2 14 2 = 0.

(9.85)

Our goal is to choose so that


(a) Both eigenvalues are less than 1 in modulus, so | 1 |, | 2 | < 1. This is the minimial
requirement for convergence of the method.
(b) The largest eigenvalue (in modulus) is as small as possible. This will give the smallest
spectral radius for T and hence the fastest convergence rate.
By (8.29), the product of the two eigenvalues is the determinant,
1 2 = det A = (1 )2 .
If 0 or 2, then det A 1, and hence least one of the eigenvalues would have
modulus larger than 1. Thus, in order to ensure convergence, we must require 0 < < 2.
For GaussSeidel, at = 1, the eigenvalues are 1 = 0, 2 = 14 , and the spectral radius is
(T1 ) = .25. This is exactly the square of the Jacobi spectral radius, and hence the Gauss
Seidel iterates converge twice as fast it only takes, on average, about 1.65 GaussSeidel
iterations to produce a new decimal place of accuracy. It can be shown (Exercise ) that
as increases above 1, the two eigenvalues move together, the larger one decreasing in
size. They are equal when

= ? = 8 4 3 1.07.
3/7/03

361

c 2003

Peter J. Olver

At that point, 1 = 2 = .07 = (T ), which is the convergence rate of the optimal


SOR scheme. Each iteration produces slightly more than one new decimal place in the
solujtion, which represents a significant improvement over the GaussSeidel rate of .25.
It takes about twice as many GaussSeidel iterations (and four times as many Jacobi
iterations) to produce the same accuracy as this optimal SOR method.
Of course, in such a simple 2 2 example, it is not so surprising that we can construct
the optimal relaxation parameter by hand. In his 1950 thesis, cf. [128], Young found the
optimal value of the relaxation parameter for a broad class of matrices that includes most of
those arising in the finite difference and finite element numerical solutions to ordinary and
partial differential equations. For the matrices in Youngs class, the Jacobi eigenvalues
occur in signed pairs. If are a pair of eigenvalues for the Jacobi method, then the
corresponding eigenvalues of the SOR iteration matrix satisfy the quadratic equation
( + 1)2 = 2 2 .

(9.86)

If = 1, so we have standard GaussSeidel, then 2 = 2 , and so the corresponding


GaussSeidel eigenvalues are = 0, = 2 . The GaussSeidel spectral radius is therefore
the square of the Jacobi spectral radius, and so (at least for matrices in the Young class)
its iterates converge twice as fast. The quadratic equation (9.86) has the same properties
as in the 2 2 version (9.85) (which corresponds to the case = 12 ), and hence the optimal
value of will be the one at which the two roots are equal,
p
2
2 2 1 2
p
.
=
1 = 2 = 1,
which occurs when
=
2
1 + 1 2

Therefore, if J = max | | denotes the spectral radius of the Jacobi method, then the
GaussSeidel has spectral radius GS = 2J , while the SOR method with optimal relaxation
parameter
2
p
? =
,
has spectral radius
? = ? 1.
(9.87)
1 + 1 2J

For example, if J = .99, which is quite slow convergence (but common for iterative solution
of partial differential equations), then GS = 0.9801, which is twice as fast, but still quite
slow, while SOR with ? = 1.7527 has ? = 0.7527, which is dramatically faster. Indeed,
since ? (GS )14 (J )28 , it takes about 14 GaussSeidel (and 28 Jacobi) iterations to
produce the same accuracy as one SOR iteration. The fact that such a simple idea can
have such a dramatic effect on the convergence rate is amazing.
Conjugate Gradients
So far, we have learned two main classes of algorithms for solving linear systems.
The first, the direct methods based on some version of Gaussian elimination or matrix
factorization, eventually obtain the solution, but must be carried through to completition
before any useful information is obtained. The alternative, iterative methods discussed in

In Exercise , the reader is asked to complete the proof of optimality.

3/7/03

362

c 2003

Peter J. Olver

the present chapter, lead to closer and closer approximations of the solution, but never
reach the actaul value exactly. One might ask whether there are algorithms that combine
the best of both: semi-direct methods that give closer and closer approximations to the
solution, but are guaranteed to eventually terminate with the exact solution in hand.
For instance, one might ask for an algorithm that successively computes each entry
of the solution vector. This is unlikely, but if we recall that the entries of the solution are
merely its coordinates with respect to the standard basis e1 , . . . , en , then one might relax by
asking that one compute the coordinates t1 , . . . , tn of the solution vector u = t1 v1 + tn vn
with respect to some basis that is adapted to the linear system. Ideally, the basis should
be orthogonal; but orthogonality with respect to the standard Euclidean dot product is
not typically relevant for a linear system A u = b.
An even better idea is to arrange that the basis be orthogonal with respect to a nonstandard inner product. In particular, if the linear system to be solved takes the form
K u = f in which the coefficient matrix is positive definite, as occurs in many applications,
then orthogonality with respect to this inner product h v ; w i = v T K w = v K w looks
very natural. Vectors that are orthogonal with respect to the inner product induced by
the coefficient matrix K are known as conjugate vectors, which explain half the name of
the conjugate gradient algorithm, due to Hestenes and Stiefel.
The goal is to solve K u = f . We shall construct the solution by successive approximation, with the k th iterate having the form
uk = t 1 v 1 + t k v k ,
where, as advertized, the vectors v1 , . . . , vn form a Korthogonal basis. The secret is to
construct the orthogonal basis vectors cduring the course of the algorithm. We begin,
merely for convenience, with an initial guess u0 = 0 for the solution. We compute the
residual vector r0 = f K u0 . The residual vector is minus the gradient of the quadratic
function p(u), and hence indicates the direction of steepest decrease. (See Chapters A
and B and Section 18.3 for details.) Thus, we update our original guess by moving in this
direction, taking v1 = r0 as our first conjugate direction. The next guess u1 = u0 + t1 v1 ,
and we choose the parameter t1 so that the corresponding residual vector
r1 = f K u 1 = r 0 t 1 K v 1

(9.88)

is a close to 0 (in the Euclidean norm) as possible. This occurs when it is orthogonal to
r0 , and so we require
0 = r 0 r 1 = k r 0 k 2 t 1 r0 K v 1 = k r 0 k 2 t 1 v 1 K v 1 .
Therefore we set

k r 0 k2
.
t1 =
v1 K v 1

(9.89)

Our inclination is to try to update u1 by moving in the gradient direction, which is the
residual vector r1 . However, this strict gradient descent algorithm, while often useful for
minimizing truly nonlinear functions, cf. Section 18.3, turns out in most cases to converge
much too slowly in the present situation. Rather, we introduce a direction v 2 which is
3/7/03

363

c 2003

Peter J. Olver

conjugate, meaning Korthogonal to the first direction v1 = r0 . Thus, as in Gram


Schmidt, we modify the direction by setting
v2 = r 1 + s1 v1 ,
where the scalar factor s1 is determined by the orthogonality requirement
0 = h v 2 ; v 1 i = v 2 K v 1 = r 1 K v 1 + s1 v1 K v 1 .
Therefore, using (9.88) and then (9.89) and the orthogonality of r 0 and r1 ,
s1 =
We then update

r1 K v 1
1 r1 (r1 r0 )
k r 1 k2
.
=
=
v1 K v 1
t1 v 1 K v 1
k r 0 k2
u2 = u 1 + t 2 v 2 = t 1 v 1 + t 2 v 2

so as to make the corresponding residual vector


r2 = f K u 2 = r 1 t 2 K v 2
as small as possible, which is accomplished by requiring it to be orthogonal to r 1 , and so,
using the Korthogonality of v1 and v2 ,
0 = r 1 r 2 = k r 1 k 2 t 2 r1 K v 2 = k r 1 k 2 t 1 v 2 K v 2 ,

and so

t2 =

k r 1 k2
.
v2 K v 2

Continuing in this manner, at the k th stage, we have already constructed the mutually
Korthogonal directions v1 , . . . , vk1 , and solution approximation uk1 as a auitbale linear
combination thereof. The next conjugate direction is given by
vk = rk1 + sk vk1 ,
and the Korthogonality requires
sk =

k rk1 k2
.
k rk2 k2

The updated solution


uk = uk1 + tk vk = t1 v1 + + tk vk
is then to make the corresponding residual
rk = f K uk = rk1 t2 K vk
as small as possible, which requires
0 = rk1 rk = k rk1 k2 t2 rk1 K vk = k rk1 k2 tk vk K vk ,
and so
tk =
3/7/03

k rk1 k2
.
vk K v k

364

c 2003

Peter J. Olver

This completes the conjugate gradient algorithm. The only matrix operation required is a
multiplication K vk in the computation of tk ; all the other operations are fast dot products.
Unlike Gaussian elimination, the method produces a sequence of successive approximations uk to the solution. Moreover, unlike purely iterative methods, the algorithm
terminates (assuming high precision arithmetic) because as remarked at the outset, there
are at most n conjugate directions, forming an K orthogonal basis of R n , and hence
un = t1 v1 + + tn vn must be the so,ution since its residual rn = f K un is orthogonal
to all the vi , and hence must be 0.
Example 9.46.

9.6. Numerical Computation of Eigenvalues.


The importance of the eigenvalues of a square matrix for both continuous and discrete dynamical systems has been amply demonstrated in this chapter and its predecessor.
Finding the eigenvalues and associated eigenvectors is not an easy computational problem,
and there is no explicit formula for the eigenvalues of most matrices. Therefore, there is a
need for efficient, numerical approximation schemes.
We have already noted that one cannot practically compute eigenvalues of matrices by
solving the characteristic equation (8.17). Any practical numerical procedure must avoid
computation of determinants, and so requires a completely different approach. In this
section, we discuss some of the available numerical techniques for computing eigenvalues
of matrices. The most direct are based on the connections between the eigenvalues and the
high powers of a matrix. A more sophisticated technique is based on the Q R factorization
that we learned in Section 5.3, and will be presented at the end of the section.
The Power Method
We have already noted the role played by the eigenvalues and eigenvectors in the
solution to a linear iterative system. Now we are going to turn the tables, and use the
iterative system as a mechanism for approximating the eigenvalues, or, more correctly, the
largest eigenvalue. The resulting computational procedure is known as thepower method .
We assume, for simplicity, that A is a complete matrix. Let v1 , . . . , vn denote the
eigenvector basis, and 1 , . . . , n the corresponding eigenvalues. As we learned, the solution
to the linear iterative system
v(k+1) = A v(k) ,

v(0) = v,

(9.90)

is given by multiplying the initial vector v by the successive powers of the coefficient
matrix: v(k) = Ak v. If we write the initial vector in terms of the eigenvector basis
v = c 1 v1 + + c n vn ,

(9.91)

then the solution takes the explicit form given in Theorem 9.3, namely
v(k) = Ak v = c1 k1 v1 + + cn kn vn .
3/7/03

365

(9.92)
c 2003

Peter J. Olver

Suppose that A has a single real eigenvalue say 1 that is largest in magnitude,
so
| 1 | > | j |

for all

j > 1.

(9.93)

The largest eigenvalue will completely dominate the iteration (9.92). Indeed, since
| 1 |k | j |k

for all j > 1 and all k 0,

the first term in the iterative formula (9.92) will eventually be much larger than all the
rest, and so, provided c1 6= 0,
v(k) c1 k1 v1

for

k 0.

Therefore, under the assumption (9.93), the solution to the iterative system (9.90) will,
almost always, end up being a multiple of the first eigenvector of the coefficient matrix.
(k)
Moreover, the entries of v(k) are given by vi = k1 vi , and hence, as long as vi 6= 0, we
can recover the eigenvalue 1 itself by taking a ratio between any nonzero components of
succesive iterates:
(k+1)
vi
(k)
1 (k) ,
provided
vi 6= 0.
(9.94)
vi

1 2
2
Example 9.47. Consider the matrix A = 1 4 2 . As the reader can
3 9
7
check, its eigenvalues and eigenvectors are

1
0
1
1 = 3,
v1 = 1 ,
2 = 2,
v2 = 1 ,
3 = 1,
v3 = 1 .
3
1
2
T

If we repeadetly multiply the initial vector v = ( 1, 0, 0 ) by A, the resulting vectors


v(k) = Ak v are given in the accompanying table. The last column lists the ratio between
the first components of v(k) and v(k1) . These ratios are converging to the third and largest
eigenvalue 3 = 3, while v(k) is converging to a very large multiple of the corresponding
eigenvector v3 .

The success of the power method requires that A have a unique eigenvalue of maximal
modulus, | 1 | = (A), which, by definition, equals its spectral radius. The rate of convergence of the method is governed by the ratio | 2 /1 | between the next largest eigenvalue
and the dominant eigenvalue. Thus, the further the domainant eigenvalue lies away from
the others, the faster the method converges.
Since complex eigenvalues of real matrices come in complex conjugate pairs of the
same modulus, the case when the largest eigenvalue is complex is not covered, although
one can modify the method to handle such cases; see Exercise . We also had to assume
that the decomposition of the initial vector v in terms of the eigenvector basis contained a
nonzero multiple of the dominant eigenvector: c1 6= 0. This is not so easy to guarantee in
advance, although one must be quite unlucky to make such a poor choice of initial vector.
3/7/03

366

c 2003

Peter J. Olver

v(k)

k
1
2
3
4
5
6
7
8
9
10
11
12

1
7
25
79
241
727
2185
6559
19681
59047
177145
531439

1
11
17
95
209
791
2057
6815
19169
60071
175097
535535

3
27
69
255
693
2247
6429
19935
58533
178167
529389
1598415

1.
7.
3.5714
3.1600
3.0506
3.0166
3.0055
3.0018
3.0006
3.0002
3.0001
3.0000

(Of course, the stupid choice v = 0 is not counted.) Moreover, even if c1 = 0 initially,
numerical round-off error will typically come to ones rescue, since it will almost always
introduce a tiny component of the eigenvector v1 into some iterate, and this component
will eventually dominate the computation. The trick is to wait long enough for it to show
up!
Since the iterates of A are, typically, getting either very large when (A) > 1
or very small when (A) < 1 the iterated vectors will be increasingly subject to
round-off error, if not numerical over or underflow. One way to avoid this problem is to
work with unit vectors, whose entries cannot get too large, and so are less likely to cause
numerical errors in the computations. We define
u(k) =

v(k)
Ak v
=
,
k Ak v k
k v(k) k

where k k is any convenient norm the 1 and norms being slightly easier to compute
than the Euclidean norm. The unit vectors u(k) can be computed iteratively by setting
u(0) =

v
,
kvk

and

u(k+1) =

A u(k)
.
k A u(k) k

(9.95)

If the largest eigenvalue 1 > 0 is positive, then u(k) u1 will converge to one of the two
unit eigenvectors (the other one is u1 ) corresponding to the eigenvalue 1 . If 1 < 0,
then the iterates will switch back and forth between the two eigenvectors u (k) (1)k u1 .
In either case, the eigenvalue 1 is obtained as a limiting ratio between nonzero entries of
u(k) and A u(k) . If some other behavior is observed, it means that one of our assumptions
is not valid; either A has more than one (complex) eigenvalue of maximum modulus, or it
is not complete.
3/7/03

367

c 2003

Peter J. Olver

u(k)

k
1
2
3
4
5
6
7
8
9
10
11
12

0.3015
0.2335
0.3319
0.2788
0.3159
0.2919
0.3080
0.2973
0.3044
0.2996
0.3028
0.3007

0.3015
0.3669
0.2257
0.3353
0.2740
0.3176
0.2899
0.3089
0.2965
0.3048
0.2993
0.3030

0.9045
0.9005
0.9159
0.8999
0.9084
0.9022
0.9061
0.9035
0.9052
0.9041
0.9048
0.9043

1.0000
7.0000
3.5714
3.1600
3.0506
3.0166
3.0055
3.0018
3.0006
3.0002
3.0001
3.0000

Example 9.48. For the matrix considered in Example 9.47, if we multiply the initial
T
vector u(k) = ( 1, 0, 0 ) by A, the resulting vectors u(k) = Au(k1) /k Au(k1) k are given
in the table. The last column is the ratio between the first components of u (k) and Au(k) ,
which converges to the largest eigenvalue 1 = 3.
The Q R Algorithm
The power method only produces the largest eigenvalue of a matrix A. The inverse
power method of Exercise can be used to find the smallest eigenvalue. Additional eigenvalues can be found by using the shifted inverse power method of Exercise . However, if
we need to know all the eigenvalues, these methods are too time-consuming.
The most important method for simultaneously approximating all the eigenvalues is
the remarkable Q R algorithm, and first proposed in 1961, independently by Francis, [51],
and Kublanovskaya, [81]. The underlying idea is simple but surprising. First factor the
matrix
A = A 0 = Q 0 R0
into a product of an orthogonal matrix and a positive upper triangular matrix (i.e., with
positive entries along the diagonal) using the GramSchmidt orthogonalization procedure
of Theorem 5.24. Then multiply the two factors together in the wrong order ! The result
is a new matrix
A 1 = R 0 Q0 .
The next step is to factorize
A 1 = Q 1 R1
using GramSchmidt on the new matrix A1 . The algorithm proceeds by iterating the
processes of Q R factorization and then reversing the order of the factors. The general step
3/7/03

368

c 2003

Peter J. Olver

is
Ak+1 = Rk Qk = Qk+1 Rk+1 ,

(9.96)

where Qk , Rk are known from the previous step, and Qk+1 , Rk+1 are computed using the
GramSchmidt algorithm.
The astonishing fact is that, for many matrices A, the iterates
Ak V

(9.97)

converge to a upper triangular matrix V whose diagonal entries are the eigenvalues of A.
Thus, after a sufficient number of iterations, the matrices Ak will have very small entries
below the diagonal, and one can read off a complete system of approximate eigenvalues
from their diagonal entries. For each eigenvalue, the compuation of the corresponding
eigenvector can be done by solving the approprate homogeneous linear system, or by using
the shifted inverse power method of Exercise .

2 1
Example 9.49. Consider the matrix A =
. The initial A = Q0 R0 factor2 3
ization produces

2.8284 2.8284
0.7071 0.7071
.
,
R0 =
Q0 =
0
1.4142
0.7071 0.7071

4 0
. We refactorize
These are multiplied in the reverse order to give A1 = R0 Q0 =
1 1
A1 = Q1 R1 and reverse multiply to produce

0.9701 0.2425
4.1231 0.2425
Q1 =
,
R1 =
,
0.2425 0.9701
0
0.9701

4.0588 0.7647
A 2 = R 1 Q1 =
.
0.2353 0.9412
The next iteration yields

0.9983
Q2 =
0.0579

0.0579
0.9983

4.0656
,
R2 =
0

4.0178 0.9431
A 3 = R 2 Q2 =
.
0.0569 0.9822

0.7090
0.9839

Continuing in this manner, after 9 iterations we find, to four decimal places

1 0
4 1
4
Q9 =
,
R9 =
,
A10 = R9 Q9 =
0 1
0
1
0

1
1

The eigenvalues of A, namely 4 and 1, appear along the diagonal of A 10 . Additional


iterations produce very little further change, although they can be used for incrfeasing the
accuracy of the computed eigenvalues.

The precise requirements appear below.

3/7/03

369

c 2003

Peter J. Olver

If the original matrix A is symmetric, then the limiting matrix Ak is, in fact, a
diagonal matrix with the eigenvalues of A appearing along the diagonal. Moreover, in this
b =Q
b
b
case, if we recursively define Q
k
k1 Qk = Q0 Q1 Qk1 Qk , then Qk Q? have,
as their limit, the orthogonal matrix whose columns are the orthonormal eigenvector basis
of A.

2 1
0
Example 9.50. Consider the symmetric matrix A = 1 3 1 . The initial
0 1 6
A = Q0 R0 factorization produces

0.8944 0.4082 0.1826


2.2361 2.2361 0.4472
b = Q = 0.4472 0.8165 0.3651 ,
Q
R0 =
0
2.4495 3.2660 ,
0
0
0
0.4082
0.9129
0
0
5.1121

and so

3
A1 = R0 Q0 = 1.0954
0.

1.0954
0.
3.3333 2.0870 .
2.0870 4.6667

We refactorize A1 = Q1 R1 and reverse multiply to produce

0.9393 0.2734
0.2071
3.1937 2.1723 0.7158
Q1 = 0.3430 0.7488 0.5672 ,
R1 =
0
3.4565 4.3804 ,
0
0.6038 0.7972
0
0
2.5364

0.7001 0.4400 0.5623


3.7451 1.1856
0
b = 0.7001
Q
0.2686
0.6615 ,
A2 = 1.1856 5.2330 1.5314 ,
1
0.1400 0.8569 0.4962
0
1.5314 2.0219

b =Q
b Q = Q Q . Continuing in this manner, after 10
where Q
1
0 1
0 1

6.3229
1.0000 0.0067
0.

R10 =
0
Q10 = 0.0067 1.0000 0.0001 ,
0
0.
0.0001 1.0000

0.0753
6.3232 0.0224
0.
b

Q10 = 0.3128
A11 = 0.0224 3.3581 0.0002 ,
0.9468
0.
0.0002 1.3187

iterations we find

0.
0.0006 ,
1.3187

0.5667 0.8205
0.7679 0.5591 .
0.2987 0.1194

0.0647
3.3582
0

After 20 iterations, the system has settled down, and

6.3234 0.0001
0
1 0 0
R20 =
0
3.3579
0 ,
Q20 = 0 1 0 ,
0
0
1.3187
0
0 1

6.3234
0
0
0.0710 0.5672 0.8205
b = 0.3069 0.7702 0.5590 .
A21 = 0
3.3579
0 ,
Q
20
0
0
1.3187
0.9491 0.2915 0.1194

b are the
The eigenvalues of A appear along the diagonal of A21 , while the columns of Q
20
corresponding orthonormal eigenvectors.
3/7/03

370

c 2003

Peter J. Olver

Let us now justify the Q R algorithm. The explanation is closely connected with the
more primitive power method. As we saw, for most vectors v R n , the powers v(n) = An v
tend to a multiple of the dominant eigenvector v1 corresponding to the largest eigenvalue.
e does not help produce additional eigenvectors
Applying powers of A to additional vectors u
e also tend to a multiple
of A since, for the same reaons, almost always the powers of An u
of the dominant eigenvector v1 . This is why the basic power method only produces one
eigenvalue and eigenvector. To obtain the others, we need a more sophisticated idea.
When implementing the basic power method, we saw the need to keep the size of the
vectors v(n) under control, and to accomplish this we normalized them at each step of the
algorithm, with the resulting vectors converging to the unit dominant eigenvector u 1 (or
its negative).
Since the other eigenvectors are orthogonal to this one, to also prevent the iterates
of other vectors from getting closer and closer to each other, we should orthonormalize at
each step! And, indeed, this is the effect of the Q R algorithm.
(0)
(0)
Thus, we start with any orthonormal set of vectors u1 , . . . , un , which, for simplicity,
(0)
we take to be the standard basis vectors of R n , and so uj = ej . At the kth stage of the
(k)

(k)

algorithm, we set u1 , . . . , un to be the orthonormal vectors that result from applying


(k)
the GramSchmidt algorithm to the powers vj = Ak ej . In matrix language, the vectors
(k)

(k)

(k)

(k)

v1 , . . . , vn are merely the columns of Ak , and the orthonormal basis u1 , . . . , un are


b in the Q R decomposition
the columns of the orthogonal matrix Q
k
b R
b
Ak = Q
k k

(9.98)

of the k th power of A. Note that, using (9.96)


A = Q 0 R0 ,

A 2 = Q 0 R0 Q0 R0 = Q 0 Q1 R1 R0 ,

A 3 = Q 0 R0 Q0 R0 Q0 R0 = Q 0 Q1 R1 Q1 R1 R0 = Q 0 Q1 Q2 R2 R1 R0 ,
and, in general,

Rk Rk1 R1 R0 .
Ak = Q0 Q1 Qk1 Qk

(9.99)

Note that the product of orthogonal matrices is also orthogonal, and the product of upper
triangular matrices with positive diagonal entries is also upper triangular with positive
diagonal entries. Therefore, comparing (9.98), (9.99) and using the uniqueness of the Q R
factorization of a matrix, we conclude that
b = Q Q Q
Q
k
0 1
k1 Qk ,

b =R R
R
k
k k1 R1 R0 .

In order to ensure the success of the Q R algorithm, we need to impose a few restrictions on the matrix A. For simplicity we only consider real n n matrices. The key
requirement is that A possesses a complete system of n distinct eigenvalues 1 , . . . , n .
Moreover, we require that the eigenvalues are all of different magnitudes, so | i | 6= | j |
for i 6= j. This effectively excludes any real matrices with complex eigenvalues from consideration, since the complex eigenvalues always come in complex conjugate pairs, which
3/7/03

371

c 2003

Peter J. Olver

are of the same modulus. The most important class of matrices for which the algorithm
applies are the symmetric matrices, all of whose eigenvalues are real.
We order the eigenvalues so that
| 1 | > | 2 | > > | n |,

(9.100)

and so 1 is the dominant eigenvalue that would appear in the power method. Let
S = [v1 , . . . , vn ] denote the corresponding eigenvector matrix. The Diagonalization Theorem 8.19 tells us that
A = S S 1 ,

where

= diag(1 , . . . , n ).

Substituting into (9.98), we find


b R
b
Ak = S k S T = Q
k k.

We now make an additional assumption on the eigenvectors by requiring that S 1


be a regular matrix, and so, by Gaussian elimination, has a factorization S 1 = L U into
a product of special lower and upper triangular matrices. This holds generically, and is
the analog of the condition that our original vector in the ordinary power method has a
nonzero coefficient of the dominant eigenvector. Then,
b R
b 1 .
S k L = Q
k kU

Multiplying on the right by k we obtain


b R
e
S k L k = Q
k k,

where

e =R
b U 1 k
R
k
k

(9.101)

is an upper triangular matrix with positive diagonal entries.


Now consider what happens as k . Since L is lower triangular, the entries of the
matrix k L k below the diagonal are lij (j /i )k 0 since i > j, and, by (9.100),
| j /i | < 1. Its diagonal entries are all equal to 1, while it has all zero entries above the
diagonal. Therefore, in the limit,
k L k I .
Moreover, the convergence rate is governed by the ratio | 2 /1 | between the subdominant
and the domainant eigenvalues. As a consequence, in the limit as k , the left hand
side of (9.101) tends to the eigenvector matrix S.
We now apply the following easy lemma, whose proof is outlined in Exercise .
Lemma 9.51. The products of orthogonal and positive upper triangular matrices
have a limit, lim Qk Rk = S if and only if the individual matrices have limits lim Qk =
k

Q? , lim Rk = R? , where S = Q? R? is the Q R factorization of the limit S into a product


k

of orthogonal and positive upper triangular matrices.


3/7/03

372

c 2003

Peter J. Olver

Thus, in our case,


b Q ,
Q
k
?

e R ,
R
k
?

where

S = Q ? R?

is the Q R factorization of the eigenvector matrix. Therefore,


1

k1
b1
b R
e k
e
e R
e1 R R1 = V,
R
U
=R
Rk = R
k1
k
k k1 = Rk U
?
?
k1
b 1 Q
b
Qk = Q
k1 k I ,

and hence

Ak = Qk Rk V = R? R?1 .

Since R? is upper triangular, it is easily seen that the limiting matrix V = R ? R?1 is
also upper triangular, and has the same diagonal entries as , namely the eigenvalues of
A. This completes the proof of the basic Q R algorithm.
If A = AT is symmetric, then the eigenvector matrix S can be chosen to be an
orthogonal matrix, whose columns are the orthonormal eigenvector basis of A, and in
the Spectral Theorem 8.25. In this case, S = Q? and R? = I , and hence the limiting
b Q = S converges to the
matrix V = is the diagonal eigenvalue matrix, while Q
k
?
1
orthogonal eigenvector matrix. In this case, since S = S T , its rows are the (transposed)
eigenvectors. Therefore, the regularity condition can be interpreted as saying that, for
each k = 1, . . . , n, the orthogonal projections of first k eigenvectors u 1 , . . . , uk onto the
subspace spanned by e1 , . . . , ek remain linearly independent. In non-regular cases, a more
sophisticated anaolysis proves that the algorithm still converges, but the eigenvalues do
not necessarily appear in decreasing order of size along the diagonal of V .
Theorem 9.52. If A is symmetric, satisfies (9.100), and S T is a regular matrix,
b S and R in the Q R algorithm converge to, respectively
then the matrices Q
k
k
the eigenvector matrix and the diagonal eigenvalue matrix. The rate of convergence is
governed by the ratio between the subdominant and dominant eigenvalues.
In practical implementations, the straight Q R algorithm takes too many arithmetic
implementations unless A is a tridiagonal matrix. More sophisticated approaches use
Householder matrices to convert A to tridiagonal form before applying the algorithm. See
[101] for details.

3/7/03

373

c 2003

Peter J. Olver

Chapter 10
Boundary Value Problems in One Dimension
In this chapter, we begin our discussion of continuous mechanical systems. The equilibrium equations of one-dimensional continuum mechanics bars, beams, etc. are
formulated as boundary value problems for scalar ordinary differential equations. The basic framework introduced for discrete mechanical systems in Chapter 6 will carry over, in
essence, to the infinite-dimensional context governing such boundary value problems. The
underlying Euclidean vector space R n becomes a suitable function space. Vectors become
functions, while matrices turn into linear differential operators. We shall characterize the
underlying linear boundary value problems as self-adjoint and positive (semi-)definite, with
respect to a suitable inner product on function space. Stable configurations lead to positive
definite boundary value problems whose equilibrium solutions can then be characterized
by a general minimization principle based on a quadratic functional representing the total
energy in the system. Once again, Nature seeks to minimize energy.
Many of the basic linear algebra methods that we have learned in the preceding
chapters can be readily translated into this new situation. They not only provide us with
important insights into the nature of solutions, but also motivate basic solution techniques,
and, ultimately, underly all numerical solution methods. In the infinite-dimensional function space framework underlying these boundary value problems, the general superposition
principle becomes reformulated in terms of the response of the system to a unit impulse
force concentrated at a single point. However, finding a function to represent a unit impulse is a non-trivial issue; it turns out that standard functions will not do the trick, and
we are led to develop a theory and calculus of generalized functions. The most important
generalized function is the delta function which represents the required concentrated unit
impulse. The solution governing the reponse of the system to a unit impulse force is known
as the Greens function. The general solution to the inhomogeneous system can be reconstructed by superimposing the effects of suitably scaled impulse responses at all different
positions. Understanding this construction will become increasingly important when we
progress on to partial differential equations, where direct analytical solution techniques are
far harder to come by.
In simple situations, we are able to develop explicit analytical formulae for the solution.
One should never underestimate the value of explicit formulae for providing insight into the
underlying physical processes and behavior of general systems. However, more complicated
problems require numerical solution techniques, and this forms the subject of the final
section. Numerical solutions to positive definite boundary value problems will be based
on the finite element method, which relies on the characterization of solution through a
minimization principle. The differential equations are converted into a system of linear
3/7/03

374

c 2003

Peter J. Olver

x
u(x)

Figure 10.1.

Bar with One Fixed Support.

algebraic equations by minimizing the restriction of the energy functional to a suitably


chosen finite-dimensional subspace of the full function space. An alternative approach
to the finite element solution, that can be applied even in situations where there is no
minimum principle available, is based on the idea of a weak solution to the boundary value
problem, where one relaxes the classical differentiability requirements.

10.1. Elastic Bars.


A bar is a mathematical idealization of a one-dimensional linearly elastic continuum
that can be stretched or contracted in the longitudinal direction, but is not allowed to bend
in a transverse direction. (Materials that can bend are called beams, and will be analyzed
in Section 10.4.) We will view the bar as the continuum limit of a one-dimensional chain
of masses and springs a system that we already analyzed in Section 6.1. Intuitively,
the continuous bar consists of an infinite number of masses connected by infinitely short
springs. The individual masses can be thought of as the atoms in the bar, although one
should not try to read too much into the physics of this interpretation.
We shall derive the basic equilibrium equations for the bar from first principles. Recall
the three basic steps we already used to establish the corresponding equilibrium equations
for discrete mechanical systems (massspring chains and structures):
(i ) First, use geometry to relate the displacement of the masses to the elongation
in the connecting springs.
(ii ) Second, use the constitutive assumptions such as Hookes Law to relate the
strain to the stress or internal force in the system.
(iii ) Finally, impose a force balance between external and internal forces.
The remarkable fact, which will, when suitably formulated, carry over to the continuum,
is that the force balance law is directly related to the geometrical displacement law by a
transpose or adjoint operation.
3/7/03

375

c 2003

Peter J. Olver

Consider a bar of length ` hanging from a fixed support, with the bottom end left
free, as illustrated in Figure 10.1. We use 0 x ` to refer to the reference or unstressed
configuration of the bar, so x measures the distance along the bar from the fixed end x = 0
to the free end x = `. Note that we are adopting the convention that the positive x axis
points down. Let u(x) denote the displacement of the bar from its reference configuration.
This means that the atom that started at position x has moved to position x + u(x).
With our convention, u(x) > 0 means that the atom has moved down, while if u(x) < 0
the atom has moved up. In particular,
u(0) = 0

(10.1)

because we are assuming that the top end is fixed and cannot move.
The strain in the bar measures the relative amount of stretching or elongation. Two
nearby atoms, at respective positions x and x + x, are moved to positions x + u(x) and
x + x + u(x + x). The original, unstressed length of this small section of bar was x,
while in the new configuration the same section has length

x + x + u(x + x) x + u(x) = x + u(x + x) u(x) .

Therefore, this piece of the bar has been elongated by an amount u(x + x) u(x). The
dimensionless strain measures the relative elongation, and so is obtained by dividing by
the reference length: [ u(x + x) u(x) ]/x. We now take the continuum limit by letting
the two atoms become infinitesimally close. Mathematically, we set y = x + x and let
the interatomic spacing x 0. The result is the strain function
u(x + x) u(x)
du
=
x0
x
dx

v(x) = lim

(10.2)

that measures the local stretch in the bar at position x.


We may approximate the bar by a chain of n masses connected by n springs, and
letting the bottom mass hang free. The mass/spring chain will have total length `, and so
the individual springs have reference length
x =

`
.
n

The bar can be viewed as the continuum limit of such a mass/spring chain, where the
number of masses n and the spring lengths x 0. The k th mass starts out at
position
k`
xk = k x =
,
n
and, under forcing, experiences a displacement uk . The strain or relative elongation of the
k th spring is
u
uk
e
(10.3)
.
vk = k = k+1
x
x
In particular, since the fixed end cannot move, the first value u0 = 0 is omitted from the
subsequent equations.
3/7/03

376

c 2003

Peter J. Olver

Remark : We will find it helpful to label the springs from k = 0 to k = n1 here. This
will facilitate comparisons with the bar, which, by convention, starts at position x 0 = 0.
The relation (10.3) between displacement and strain takes the familiar matrix form

T
T
v = A u,
v = v0 , v1 , . . . , vn1 ,
u = ( u 1 , u2 , . . . , u n ) ,

where

1
1

A=
x

1
1

1
1

1
..
.

..

.
1

d
dx

(10.4)

is the scaled incidence matrix of the mass/spring chain. The derivative operator d/dx that
relates displacement to strain in the bar equation (10.2) can be viewed as the continuum
limit, as the number of masses n and the spring lengths x 0, of the scaled
incidence matrix (10.4). Vice versa, the incidence matrix can be viewed as a discrete,
numerical approximation to the derivative operator. Indeed, if we regard the discrete
displacements and strains as approximations of their continuous counterparts, so
uk u(xk ),

k (xk ),

then (10.3) takes the form


u(xk+1 ) u(xk )
u(xk + x) u(xk )
du
=

(x ).
x
x
dx k
justifying the identification (10.4). The passage back and forth between the discrete and
the continuum forms the foundation of mechanics solids, fluids, gases. Discrete models
both motivate and provide numerical approximations to continuum systems, which in turn
simplify and provide insight into the discrete domain.
The next part of the framework is to use the constitutive relations of the bar to relate
the strain to the stress, or internal force experienced by the bar. To keep matters simple,
we shall only consider bars that are modeled by a linear relation between stress and strain.
For physical bars, this is a pretty good assumption as long as the bar is not stretched
beyond its elastic limits. Let w(x) denote the stress on the part of the bar that was at
reference position x. Hookes Law implies that
v(xk ) =

w(x) = c(x) v(x),

(10.5)

where c(x) measures the stiffness of the bar at position x. For a homogeneous bar, made
out of a uniform material, c(x) c is a constant function. The constitutive function c(x)
is the continuum limit of the diagonal matrix
c

C=

3/7/03

c1

..

cn1
377

c 2003

Peter J. Olver

of individual spring constants ck appearing in the discrete constitutive law


wk = c k vk ,

or

w = C v,

(10.6)

Indeed, writing (10.6) as w(xk ) = c(xk ) v(xk ) makes the identification immediate.
Finally, we need to impose a force balance at each point of the bar. Suppose f (x)
is an external force at position x on the bar, where f (x) > 0 means the force is acting
downwards. Physical examples include (variable) gravitational or electrostatic forces acting
solely in the vertical direction. The bar will deform so as to balance the external force with
its own internal force resulting from stretching. Now, the internal force per unit length
on the section of the bar lying between nearby positions x and x + x is the difference in
stress at the two ends, [ w(x + x) w(x) ]/x. The force balance law requires that, in
the limit,
w(x + x) w(x)
dw
0 = f (x) + lim
= f (x) +
,
x0
x
dx
or
dw
.
(10.7)
f =
dx
The force balance law is the continuum limit of the massspring chain version,
fk =

wk1 wk
,
x

(10.8)

wn = 0,

where the final condition gives the correct formula for the force on the free-hanging bottom
mass. (Remember that the springs are numbered from 0 to n 1.) This indicates that we
should also impose an analogous boundary condition
w(`) = 0

(10.9)

at the end of the bar xn = ` which is hanging freely and so is unable to support any stress.
The matrix form of the discrete system (10.8) is
f = AT w,
where the transposed scaled incidence matrix

1 1

1 1

1 1
1

AT =

1
x

1
..
.

..

d ,

dx

(10.10)

should approximate the differential operator d/dx that appears in the continuum force
balance law (10.7). Thus, we should somehow interpret the differential operator d/dx as
the transpose or adjoint of the differential operator d/dx. This important point will
be developed properly in Section 10.3. But before trying to go any further in the theory,
let us analyze the mathematical equations governing some simple configurations.
3/7/03

378

c 2003

Peter J. Olver

In summary, the three basic bar equations (10.2), (10.5), (10.7) are
v(x) =

du
,
dx

w(x) = c(x) v(x),

f (x) =

dw
.
dx

(10.11)

Substituting the first equation into the second, and then the resulting formula into the last
equation, leads to the equilibrium equation

du
d
c(x)
= f (x),
0 < x < `.
(10.12)
K[ u ] =
dx
dx
Thus, the displacement u(x) of the bar is obtained as the solution to a second order
ordinary differential equation. As such, it will depend on two arbitrary constants, which
will be uniquely determined by the boundary conditions (10.1), (10.9) at the two ends:
w(`) = c(`) u0 (`) = 0.

u(0) = 0,

(10.13)

Usually c(`) > 0, in which case it can be omitted from the second boundary condition,
which simply becomes u0 (`) = 0.
Example 10.1. Consider the simplest case of a uniform bar of unit length ` = 1
subjected to a uniform force, e.g., gravity. The equilibrium equation (10.12) is
c

d2 u
= f,
dx2

(10.14)

where we are asusming that the force f is constant. This elementary second order ordinary
differential equation can be immediately integrated,
u(x) = 21 x2 + a x + b,

where

f
c

(10.15)

is the ratio of the force to the stiffness of the bar. The values of the integration constants
a and b are fixed by the boundary conditions (10.13), so
u0 (1) = + a = 0.

u(0) = b = 0,

Therefore, there is a unique solution to the boundary value problem, yielding the displacement
u(x) = x 12 x2 ,
(10.16)
which is graphed in Figure 10.2. Note that the displacement reaches a maximum at the
free end of the bar, which stretches downwards the farthest. The stronger the force, or
the less stiff the bar, the farther the overall displacement. Also note the parabolic shape
of the displacement graph, with a zero derivative, and hence zero strain, at the free end.

We will sometimes use primes, as in u0 = du/dx, to denote derivatives with respect to x.

3/7/03

379

c 2003

Peter J. Olver

0.8

0.8

0.6

0.6

0.4

0.4

0.2

0.2

0.2

0.4

0.6

0.8

0.2

u(x)
Figure 10.2.

0.4

0.6

0.8

w(x)
Displacement and Stress of Bar with One Fixed End.

Remark : This example illustrates the simplest way to solve boundary value problems.
It is essentially the same as the usual method for solving initial value problems. First, solve
the differential equation by standard methods (if possible). For a second order equation,
the general solution will involve two arbitrary constants. The values of the constants are
found by substituting the general solution into the two boundary conditions. Unlike initial
value problems, the existence and/or uniqueness of the solution to a general boundary value
problem is not always guaranteed, and so one may encounter situations where one cannot
complete the solution; see, for instance, Example 7.32. A more sophisticated approach,
based on the concept of a Greens function, will be discussed in the following section.
As in the discrete situation, this particular mechanical configuration is statically determinate, meaning that we can solve directly for the stress w(x) in terms of the external
force f (x) without having to compute the displacement first. In this particular example,
we need to solve the first order boundary value problem

dw
= f,
dx

w(1) = 0,

arising from the force balance law (10.7), which yields


w(x)
= (1 x).
c
Note that the boundary condition determines the integration constant uniquely. We can
then find the displacement u(x) by solving another boundary value problem
w(x) = f (1 x),

and

du
= v(x) = (1 x),
dx

v(x) =

u(0) = 0,

resulting from (10.2), which again leads to (10.16). As before, the appearance of one
boundary condition implies that we can find a unique solution to the differential equation.
Remark : We motivated the boundary value problem for the bar by taking the continuum limit of the massspring chain. Let us see to what extent this limiting procedure
3/7/03

380

c 2003

Peter J. Olver

can be justified. To compare the solutions, we keep the reference length of the chain fixed
at ` = 1 and its total mass fixed at m. So, if we have n identical masses, each spring has
length x = 1/n. The k th mass will start out at reference position xk = k/n and has mass
mk = m/n. Using static determinacy, we can solve the system (10.8), which reads
wk = wk+1 +
directly for the stresses:
wk = f

f
,
n

k
1
n

wn = 0,

= f (1 xk ) .

Thus, in this particular case, the continuous bar and discrete chain have equal stresses:
w(xk ) = wk . The strains also are in agreement:

1
k
vk = wk = 1
= (1 xk ) = v(xk ) ,
c
n
where = f /c as before. We then obtain the displacements by solving

vk

k
uk+1 = uk +
= uk +
1
.
n
n
n
Since u0 = 0, the solution is

k
k
X
X
k(k + 1)
xk
xk
k
uk =
1 2

= (xk x2k )
= u(xk )
.
k=
2
n i=1
n i=1
n
2n
2n
2n
(10.17)
Now u(xk ) is not exactly equal to uk , but their difference tends to zero as the number of
masses n . In this way, we have completely justified our approximation scheme.
Example 10.2. Consider the same uniform unit length bar as in the previous example, again subject to a uniform constant force, but now with two fixed ends. We impose the inhomogeneous boundary conditions
$$ u(0) = 0, \qquad u(1) = d, \tag{10.18} $$
where the top end is fixed, while the bottom end is displaced an amount d. (Note that d > 0 means the bar is stretched, while d < 0 means it is compressed.) The general solution to the equilibrium equation (10.14) is, as before, given by (10.15). The values of the arbitrary constants a, b are again determined by plugging into the boundary conditions (10.18), so
$$ u(0) = b = 0, \qquad u(1) = -\tfrac12\,\sigma + a + b = d. $$
Again, there is a unique solution to the boundary value problem,
$$ u(x) = \tfrac12\,\sigma\,(x - x^2) + d\,x. \tag{10.19} $$
The displacement is a superposition of two functions; the first constituent is due to the external force f, while the second is a uniform stretch due to the boundary condition.

Figure 10.3. Displacements of a Bar with Fixed Ends.

(As in Example 7.35, linearity of the boundary value problem allows us to combine the responses to different inhomogeneities.) In Figure 10.3, the dotted curves represent the two simple responses, and the solid graph is their sum, the actual displacement.
Unlike a bar with a free end, this configuration is statically indeterminate. There is no boundary condition on the force balance equation
$$ -\frac{dw}{dx} = f, $$
and so the integration constant a in the stress $w(x) = a - f\,x$ cannot be determined without first figuring out the displacement (10.19):
$$ w(x) = c\,u'(x) = f\bigl(\tfrac12 - x\bigr) + c\,d. $$

Remark: The particular boundary value problems that model the mechanical equilibria of a simple bar arise in many other physical systems. For example, the equation for the thermal equilibrium of a bar under an external heat source is given by the same boundary value problem (10.12), in which u(x) represents the temperature of the bar, c(x) represents the diffusivity or thermal conductivity of the material at position x, while f(x) represents an external heat source. A fixed boundary condition u(ℓ) = a corresponds to an end that is held at a fixed temperature a, while a free boundary condition u′(ℓ) = 0 represents an insulated end that does not allow heat energy to enter or leave the bar. Details of the physical derivation can be found in Section 13.1.
Example 10.3. Finally, consider the case when both ends of the bar are left free. The boundary value problem
$$ -u'' = f(x), \qquad u'(0) = 0, \qquad u'(\ell) = 0, \tag{10.20} $$
represents the continuum limit of a mass-spring chain with two free ends. Based on our experience, we expect the solution to manifest an underlying instability of the physical problem. Solving the differential equation, we find
$$ u(x) = a\,x + b - \int_0^x \!\!\int_0^y f(z)\,dz\,dy, $$
where the constants a, b are to be determined by the boundary conditions. Since
$$ u'(x) = a - \int_0^x f(z)\,dz, $$
the first boundary condition u′(0) = 0 requires a = 0. The second boundary condition requires
$$ u'(\ell) = -\int_0^\ell f(x)\,dx = 0, \tag{10.21} $$

which is not automatically valid! The integral represents the total force per unit length
exerted on the bar. As in the case of a mass-spring chain with two free ends, if there is a
non-zero net force, the bar cannot remain in equilibrium, but will move off in space and
the equilibrium boundary value problem (10.20) has no solution. On the other hand, if the
forcing satisfies the constraint (10.21), then the resulting solution of the boundary value
problem has the form

$$ u(x) = b - \int_0^x \!\!\int_0^y f(z)\,dz\,dy, \tag{10.22} $$

where the constant b is arbitrary. Thus, when it exists, the solution to the boundary value
problem is not unique. The constant b solves the corresponding homogeneous problem,
and represents a rigid translation of the entire bar by a distance b. This should remind
the reader of our study of linear matrix systems, and, indeed, it is another illustration of
the general linear systems Theorem 7.28.

10.2. The Green's Function.


The general superposition principle for inhomogeneous linear systems, as summarized
in Theorem 7.33, inspires a second important approach to the solution of boundary value
problems. This method relies on the solution to a particular set of inhomogeneities, namely
concentrated unit impulses. The resulting family of fundamental solutions are known as the Green's function for the system, in honor of the self-taught English mathematician (and miller) George Green. The Green's function has the important property that the solution
induced by any other inhomogeneity can be built up as a continuous superposition of these
fundamental solutions.
To motivate the construction, let us return briefly to the case of a mass-spring chain. Given the equilibrium equations
$$ K\,\mathbf{u} = \mathbf{f}, \tag{10.23} $$
let us decompose the external forcing $\mathbf{f} = (f_1, f_2, \ldots, f_n)^T \in \mathbb{R}^n$ into a linear combination
$$ \mathbf{f} = f_1\,\mathbf{e}_1 + f_2\,\mathbf{e}_2 + \cdots + f_n\,\mathbf{e}_n \tag{10.24} $$
of the standard basis vectors of $\mathbb{R}^n$. Each $\mathbf{e}_i$ represents a unit force which is applied solely to the i-th mass in the chain. Formula (10.24) shows how to decompose any other force vector as a superposition of these individual forces, with $f_i$ representing the strength of the force applied to the i-th mass. Suppose we know how to solve each of the individual systems
$$ K\,\mathbf{u}_i = \mathbf{e}_i, \qquad i = 1, \ldots, n. \tag{10.25} $$
The solution $\mathbf{u}_i$ represents the response of the chain to a single unit force concentrated on the i-th mass. The general superposition principle for linear systems says that we can then write the solution to the inhomogeneous system (10.23) as a linear combination,
$$ \mathbf{u} = f_1\,\mathbf{u}_1 + \cdots + f_n\,\mathbf{u}_n, \tag{10.26} $$
of the individual responses.


Remark : The alert reader will recognize that we are, in fact, reconstructing the solution to the linear system (10.23) by inverting the matrix K. Thus, this observation does
not lead to an efficient solution technique for discrete systems. In contrast, in the case of
continuous boundary value problems, this idea leads to one of the most important solution
paradigms, for both practical and theoretical considerations.
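As a quick illustration of the discrete superposition formulas (10.23)–(10.26), the following sketch (an assumed three-mass chain, not taken from the text) solves the unit impulse systems $K\mathbf{u}_i = \mathbf{e}_i$ and confirms that $f_1\mathbf{u}_1 + f_2\mathbf{u}_2 + f_3\mathbf{u}_3$ coincides with the direct solution of $K\mathbf{u} = \mathbf{f}$.

```python
import numpy as np

# Stiffness matrix of a three-mass chain with unit springs and both ends fixed
# (an assumed example; any symmetric positive definite K behaves the same way).
K = np.array([[ 2., -1.,  0.],
              [-1.,  2., -1.],
              [ 0., -1.,  2.]])
f = np.array([1.0, -2.0, 0.5])

# Responses to the unit impulses e_1, e_2, e_3: the columns of K^{-1}.
U = np.column_stack([np.linalg.solve(K, e) for e in np.eye(3)])

u_superposed = U @ f                       # f_1 u_1 + f_2 u_2 + f_3 u_3
u_direct     = np.linalg.solve(K, f)
print(np.allclose(u_superposed, u_direct))  # True
```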
The Delta Function
We now aim to extend this basic superposition principle to the boundary value problem
for an elastic bar. Therefore, the key question is how to characterize a force or impulse
that is concentrated on a single atom of the bar. A unit impulse at position x = y will be described by something called the delta function, and denoted by $\delta_y(x)$. Since the impulse is supposed to be concentrated solely at x = y, we should have
$$ \delta_y(x) = 0 \qquad\text{for}\qquad x \neq y. \tag{10.27} $$

Moreover, since it is a unit impulse, we want the total amount of force exerted on the
bar to be equal to one. The total force is the sum of the individual forces, which, in the
continuum limit, is represented by an integral of the force function f (x) over the length of
the bar. Thus, to represent a unit impulse, we must also require that the delta function satisfy
$$ \int_0^\ell \delta_y(x)\,dx = 1, \qquad\text{provided}\qquad 0 < y < \ell. \tag{10.28} $$
Alas, there is no function that has both of the required properties! At least, not one that behaves like a function in the usual mathematical sense. Indeed, according to the basic facts of Riemann (or even Lebesgue) integration, two functions which are the same everywhere except at one single point have exactly the same integral, [105]. Thus, since $\delta_y$ is zero except at one point, its integral should be 0, not 1. The conclusion is that the two basic requirements, (10.27), (10.28), are incompatible for ordinary functions!

Here, as before, atom is used in a figurative sense.


This unfortunate fact stopped mathematicians dead in their tracks. It took a British
engineer, Oliver Heaviside, who was not deterred by the lack of rigorous justification,
to start utilizing delta functions in practical engineering applications with remarkable
effect. Despite his success, Heaviside was ridiculed by the pure mathematicians of his
day, and eventually became mentally unstable. But, some thirty years later, the great
physicist Paul Dirac resurrected the delta function for quantum mechanical applications,
and this finally made theoreticians sit up and take notice. (Indeed, the term Dirac
delta function is quite common.) In 1944, the French mathematician Laurent Schwartz
finally established a rigorous theory of distributions that incorporated such useful, but
very unusual generalized functions, [108]. It is beyond the scope of this introductory text
to develop a fully rigorous theory of distributions. Rather, in the spirit of Heaviside, we
shall concentrate on learning, through practice with applications and computations, how
to domesticate these wild mathematical beasts.
There are two ways to view the delta function. Both are important and worth knowing.
Method #1. Limits: The first approach is to regard the delta function $\delta_y(x)$ as a limit, as $n \to \infty$, of a sequence of ordinary smooth functions $g_n(x)$. These functions will represent more and more concentrated unit forces, which, in the limit, converge to the desired unit impulse concentrated at a single point, x = y. Thus, we require
$$ \lim_{n \to \infty} g_n(x) = 0, \qquad x \neq y, \tag{10.29} $$
while the total amount of force remains fixed at
$$ \int_0^\ell g_n(x)\,dx = 1. \tag{10.30} $$
On a formal level, the limit function
$$ \delta_y(x) = \lim_{n \to \infty} g_n(x) $$
should satisfy the key properties (10.27), (10.28).


A simple explicit example of such a sequence is provided by the rational functions
$$ g_n(x) = \frac{n}{\pi\,(1 + n^2 x^2)} \tag{10.31} $$
that are graphed in Figure 10.4. These functions satisfy
$$ \lim_{n \to \infty} g_n(x) = \begin{cases} 0, & x \neq 0, \\ \infty, & x = 0, \end{cases} $$
while
$$ \int_{-\infty}^{\infty} g_n(x)\,dx = \frac{1}{\pi}\,\tan^{-1} n x\,\bigg|_{x=-\infty}^{\infty} = 1. \tag{10.32} $$

We suppress the dependence of the functions gn on the point y where the limiting delta
function is concentrated.

It is slightly simpler here to consider the entire real line corresponding to a bar of infinite
length. See Exercise for the case of a finite interval.


Figure 10.4. Delta Function as Limit.

Therefore, formally, we identify the limiting function
$$ \lim_{n \to \infty} g_n(x) = \delta(x) = \delta_0(x), $$

with the unit impulse delta function concentrated at x = 0. As n gets larger and larger,
each function gn (x) is a closer and closer approximation to the delta function, and forms
a more and more concentrated spike, while maintaining a unit total area under its graph.
The limiting delta function looks like an infinitely tall spike of zero width, entirely
concentrated at the origin.
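The limiting behaviour (10.29)–(10.32) can be confirmed numerically; the minimal sketch below (illustrative only) evaluates the total area under $g_n$ and the fraction of that area lying in a small window around the spike.

```python
import numpy as np
from scipy.integrate import quad

def g(n, x):
    # approximating impulses g_n(x) = n / (pi (1 + n^2 x^2)) from (10.31)
    return n / (np.pi * (1 + n**2 * x**2))

for n in (10, 100, 1000):
    # split at the peak so the adaptive quadrature resolves it reliably
    total = sum(quad(lambda x: g(n, x), a, b)[0]
                for (a, b) in [(-np.inf, 0.0), (0.0, np.inf)])
    near, _ = quad(lambda x: g(n, x), -0.1, 0.1)
    print(n, round(total, 6), round(near, 6))
# The total area stays at 1 while the mass concentrates at the origin.
```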
Remark: This construction of the delta function highlights the perils of interchanging limits and integrals without proper justification. In Riemann's or Lebesgue's integration theories, the limit of the functions $g_n$ would be indistinguishable from the zero function, and so the limit of their integrals (10.32) would not equal the integral of their limit:
$$ 1 = \lim_{n \to \infty} \int_{-\infty}^{\infty} g_n(x)\,dx \;\neq\; \int_{-\infty}^{\infty} \lim_{n \to \infty} g_n(x)\,dx = 0. $$

The delta function is, in a sense, a means of sidestepping this analytic inconvenience.
The full ramifications and theoretical constructions underlying such limits and generalized
functions must, however, be deferred to a rigorous course in real analysis, [105].

Remark: There are many other possible choices for the limiting functions $g_n(x)$. See Exercise for another important example.
Once we have found the delta function $\delta(x) = \delta_0(x)$ concentrated at the origin, we can obtain the delta function concentrated at any other position y by a simple translation:
$$ \delta_y(x) = \delta(x - y). \tag{10.33} $$
Thus, $\delta_y(x)$ can be realized as the limit of the translated functions
$$ \widehat g_n(x) = g_n(x - y) = \frac{n}{\pi\,\bigl(1 + n^2 (x - y)^2\bigr)}. \tag{10.34} $$

Method #2. Duality: The second approach is a bit more abstract, but much closer to the proper rigorous formulation. Here, we view a generalized function like the delta function as a real-valued linear operator $L\colon C^0[0,\ell] \to \mathbb{R}$ on a suitable function space, in this case the vector space of continuous functions on the interval $[0,\ell]$.
The key observation is that if u(x) is any continuous function, then
$$ \int_0^\ell \delta_y(x)\,u(x)\,dx = u(y), \qquad\text{for}\qquad 0 < y < \ell. \tag{10.35} $$
Indeed, since $\delta_y(x) = 0$ for $x \neq y$, the integrand only depends on the value of u(x) at the point x = y, and so
$$ \int_0^\ell \delta_y(x)\,u(x)\,dx = \int_0^\ell \delta_y(x)\,u(y)\,dx = u(y) \int_0^\ell \delta_y(x)\,dx = u(y). $$
Equation (10.35) serves to define a linear operator $L_y\colon C^0[0,\ell] \to \mathbb{R}$ that maps a continuous function $u \in C^0[0,\ell]$ to its value
$$ L_y[u] = u(y) \in \mathbb{R} $$
at the point x = y. In the dual approach to generalized functions, the delta function is, in fact, defined as this particular linear operator.
Remark: If y lies outside the integration domain, then
$$ \int_0^\ell \delta_y(x)\,u(x)\,dx = 0, \qquad y < 0 \quad\text{or}\quad y > \ell, \tag{10.36} $$
because the impulse occurs outside the interval of integration, and so the integrand is identically zero on the entire interval. For technical reasons, we will not attempt to define the integral (10.36) if y = 0 or y = ℓ lies on the boundary of our interval of integration.
The interpretation of the linear operator $L_y$ as a kind of function $\delta_y(x)$ is based on the identification between vectors and real-valued linear functions. According to Theorem 7.10, every linear function $L\colon V \to \mathbb{R}$ on a finite-dimensional inner product space is given by an inner product $L[u] = \langle a\,;u\rangle$ with a fixed element $a \in V$. Similarly, on the infinite-dimensional function space $C^0[0,\ell]$, the $L^2$ inner product
$$ L_g[u] = \langle g\,;u\rangle = \int_0^\ell g(x)\,u(x)\,dx \tag{10.37} $$

Linearity was demonstrated in Example 7.7.


with a fixed function $g \in C^0[0,\ell]$ does define a real-valued linear function $L_g\colon C^0[0,\ell] \to \mathbb{R}$. However, unlike the finite-dimensional situation, not every real-valued linear function on function space has this form! In particular, there is no continuous function $\delta_y(x)$ such that the inner product identity
$$ \langle \delta_y\,;u\rangle = \int_0^\ell \delta_y(x)\,u(x)\,dx = u(y) \tag{10.38} $$
holds for every continuous function u(x). Again, there are profound differences between
finite-dimensional and infinite-dimensional vector spaces!
The dual interpretation of generalized functions acts as if this were true. Generalized
functions are real-valued linear operators on function space, which, formally, are identified
as functions via the inner product. One can, with a little care, manipulate generalized
functions as if they were actual functions, but always keeping in mind that a rigorous
justification of such computations must ultimately rely on their formal characterization as
linear operators.
The two approaches, limits and duality, are completely compatible. Indeed, with a little extra work, one can justify the dual formula (10.35) as the limit
$$ u(y) = \lim_{n \to \infty} \int_0^\ell g_n(x)\,u(x)\,dx = \int_0^\ell \delta_y(x)\,u(x)\,dx \tag{10.39} $$
of the inner products of the function u with the approximating concentrated impulse functions $g_n(x)$. In this manner, the linear operator $L[u] = u(y)$ represented by the delta function is the limit, $L_y = \lim_{n\to\infty} L_n$, of the approximating linear operators
$$ L_n[u] = \int_0^\ell g_n(x)\,u(x)\,dx. $$

Thus, the choice of interpretation of the generalized delta function is, in some ways, a
matter of taste. For the student, the limit interpretation of the delta function is perhaps
the easier to digest at first, although the dual, linear operator interpretation has stronger
connections with the rigorous theory and, even in applications, offers some significant
advantages.
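The limit (10.39) is also easy to test numerically; the sketch below (illustrative only, with an arbitrarily chosen test function and concentration point) integrates the translated impulses $g_n(x - y)$ against u(x) and watches the value approach u(y).

```python
import numpy as np
from scipy.integrate import quad

u = np.cos            # a smooth test function
y = 0.3               # point of concentration

for n in (10, 100, 1000):
    gn = lambda x: n / (np.pi * (1 + n**2 * (x - y)**2))
    # split the real line at the peak x = y for a reliable quadrature
    val = sum(quad(lambda x: gn(x) * u(x), a, b)[0]
              for (a, b) in [(-np.inf, y), (y, np.inf)])
    print(n, val)     # approaches u(y) = cos(0.3) ~ 0.955336
```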
Although on the surface, the delta function might look a little bizarre, its utility in
modern applied mathematics and mathematical physics more than justifies learning to
work with it. Even if neither definition makes complete sense at the moment, the student
is advised to press on and gain a good working relationship with the delta function via its
basic properties. You won't go far wrong by treating it as if it were a genuine function.
After you become more comfortable with the delta function as a practical tool, you can, if
desired, return to contemplate just exactly what kind of object it really is.
Calculus of Generalized Functions
Since we are going to use the delta function to solve differential equations, we need to
find out how it behaves under the basic operations of calculus, namely differentiation and integration. The integral of the delta function is known as a step function. More specifically,

Figure 10.5. The Step Function.

the basic formulae (10.35), (10.36) imply that
$$ \int_a^x \delta_y(t)\,dt = \sigma_y(x) = \sigma(x - y) = \begin{cases} 0, & x < y, \\ 1, & x > y, \end{cases} \qquad\text{provided}\quad a < y. \tag{10.40} $$
Figure 10.5 shows the graph of $\sigma(x) = \sigma_0(x)$. Unlike the delta function, the step function $\sigma_y(x)$ is an ordinary function. It is continuous, indeed constant, except at x = y. The value of the step function at x = y is left unspecified, although a popular choice, motivated by Fourier theory, is to set $\sigma_y(y) = \tfrac12$, the average of its left and right hand limits.
We observe that the integration formula (10.40) is compatible with our characterization of the delta function as the limit of highly concentrated forces. If we integrate the approximating functions (10.31), we obtain
$$ f_n(x) = \int_{-\infty}^x g_n(t)\,dt = \frac{1}{\pi}\,\tan^{-1} n x + \frac12. $$
Since
$$ \lim_{y \to \infty} \tan^{-1} y = \tfrac12\,\pi, \qquad\text{while}\qquad \lim_{y \to -\infty} \tan^{-1} y = -\tfrac12\,\pi, $$
these functions converge to the step function:
$$ \lim_{n \to \infty} f_n(x) = \sigma(x) = \begin{cases} 1, & x > 0, \\[2pt] \tfrac12, & x = 0, \\[2pt] 0, & x < 0. \end{cases} \tag{10.41} $$
A graphical illustration of this limiting procedure is sketched in Figure 10.6.


Motivated by the Fundamental Theorem of Calculus, we shall use (10.40) to identify the derivative of the step function with the delta function:
$$ \frac{d\sigma}{dx} = \delta. \tag{10.42} $$
This fact is highly significant. In basic calculus, one is not allowed to differentiate a
discontinuous function. Here, we discover that the derivative is defined, not as an ordinary
function, but rather as a generalized delta function.

Figure 10.6. Step Function as Limit.

This observation is very general. The definition of a jump discontinuity relies on the existence of the left and right sided limits at a point y, which are denoted, respectively, by
$$ f(y^-) = \lim_{x \to y^-} f(x), \qquad f(y^+) = \lim_{x \to y^+} f(x). $$
The function f(x) is continuous at the point y if and only if its one-sided limits exist and are equal to its value: $f(y) = f(y^-) = f(y^+)$. If the one-sided limits are equal, but not equal to the value of the function f(y), then the function is said to have a removable discontinuity, since redefining $f(y) = f(y^-) = f(y^+)$ makes the function continuous at the point in question. An example is the function f(x) that is equal to 0 for all $x \neq 0$, but has f(0) = 1. (This function is not a version of the delta function: its integral is 0, not 1.) Removing the discontinuity by setting f(0) = 0 makes $f(x) \equiv 0$ equal to the continuous constant 0 function. Removable discontinuities play no role in our theory or applications, and will always be removed if they appear.
Finally, if both the left and right limits exist, but are not equal, then f is said to have a jump discontinuity at the point y. The magnitude of the jump is the difference
$$ \beta = f(y^+) - f(y^-) = \lim_{x \to y^+} f(x) \;-\; \lim_{x \to y^-} f(x) \tag{10.43} $$
between the right and left limits. Note that the value of the function at the point, f(y), which may not even be defined, does not play a role in the specification of the jump. The magnitude of the jump is positive if the function jumps up, when moving from left to right, and negative for a downwards jump. For example, the step function σ(x) has a single unit, i.e., magnitude 1, jump discontinuity at the origin:
$$ \sigma(0^+) - \sigma(0^-) = 1 - 0 = 1. $$
In general, the derivative of a function with jump discontinuities includes a delta function concentrated at each discontinuity. More specifically, suppose that f(x) is differentiable in the usual calculus sense everywhere except at the point y, where it has a jump discontinuity of magnitude β. We can re-express the function in the convenient form
$$ f(x) = g(x) + \beta\,\sigma(x - y), \tag{10.44} $$


Figure 10.7. The Derivative of a Discontinuous Function.

where g(x) is continuous everywhere, and differentiable except possibly at the jump. Differentiating (10.44), we find that
$$ f'(x) = g'(x) + \beta\,\delta(x - y) \tag{10.45} $$
has a delta spike of magnitude β at the discontinuity.


Example 10.4. Consider the function
$$ f(x) = \begin{cases} x, & x < 1, \\[2pt] -\tfrac15\,x^2, & x > 1, \end{cases} \tag{10.46} $$
which we graph in Figure 10.7. We note that f has a single jump discontinuity of magnitude $-\tfrac65$ at x = 1. This means that
$$ f(x) = g(x) - \tfrac65\,\sigma(x - 1), \qquad\text{where}\qquad g(x) = \begin{cases} x, & x < 1, \\[2pt] -\tfrac15\,x^2 + \tfrac65, & x > 1, \end{cases} $$
is continuous everywhere, since its right and left hand limits at the original discontinuity are equal: $g(1^+) = g(1^-) = 1$. Therefore,
$$ f'(x) = g'(x) - \tfrac65\,\delta(x - 1), \qquad\text{where}\qquad g'(x) = \begin{cases} 1, & x < 1, \\[2pt] -\tfrac25\,x, & x > 1. \end{cases} $$
In Figure 10.7, the delta spike in the derivative of f is symbolized by a vertical line, although this pictorial device fails to indicate its magnitude. Note that we can compute g′(x) by directly differentiating the formula (10.46) for f(x). This implies that, once we determine the magnitude and location of the jump discontinuities of f(x), we can compute its derivative directly without finding the continuous function g(x).
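For readers who like to double-check such computations symbolically, here is a minimal SymPy sketch (the piecewise formula follows the reconstruction of (10.46) given above and is meant only as an illustration): the jump is computed from one-sided limits, and the distributional derivative exhibits the expected delta term.

```python
import sympy as sp

x = sp.symbols('x')
left, right = x, -x**2 / 5          # the two branches of f, as in (10.46) above

jump = sp.limit(right, x, 1) - sp.limit(left, x, 1)   # f(1+) - f(1-) = -6/5
print(jump)

# Write f = g + jump*Heaviside(x-1) with g continuous; its distributional
# derivative then picks up jump*DiracDelta(x-1), as in (10.45).
g = sp.Piecewise((left, x < 1), (right - jump, True))
f = g + jump * sp.Heaviside(x - 1)
print(sp.diff(f, x))
```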
Example 10.5. As a second example, consider the function
$$ f(x) = \begin{cases} x, & x < 0, \\[2pt] x^2 - 1, & 0 < x < 1, \\[2pt] 2\,e^{-x}, & x > 1, \end{cases} $$

Figure 10.8. The Derivative of a Discontinuous Function.
Figure 10.9. First and Second Order Ramp Functions.

which is plotted in Figure 10.8. This function has jump discontinuities of magnitude −1 at x = 0, and of magnitude 2/e at x = 1. Therefore, in light of the preceding remark,
$$ f'(x) = -\,\delta(x) + \frac{2}{e}\,\delta(x - 1) + \begin{cases} 1, & x < 0, \\[2pt] 2\,x, & 0 < x < 1, \\[2pt] -2\,e^{-x}, & x > 1, \end{cases} $$
where the final terms are obtained by directly differentiating f(x).


The integral of the discontinuous step function (10.40) is the continuous ramp function,
$$ \int_a^x \sigma_y(z)\,dz = \rho_y(x) = \rho(x - y) = \begin{cases} 0, & a < x < y, \\[2pt] x - y, & x > y > a, \end{cases} \tag{10.47} $$
which is graphed in Figure 10.8. Note that $\rho(x - y)$ has a corner at x = y, and so is not differentiable there; indeed, its derivative $d\rho/dx = \sigma$ has a jump discontinuity, and its second derivative $d^2\rho/dx^2 = \delta$ is no longer an ordinary function. We can continue to integrate the delta function; its n-th integral is the n-th order ramp function
$$ \rho_n(x - y) = \begin{cases} \dfrac{(x - y)^n}{n!}, & x > y, \\[4pt] 0, & x < y. \end{cases} \tag{10.48} $$
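A small sketch of the ramp functions (10.47)–(10.48) (illustrative only) makes their increasing smoothness easy to inspect numerically.

```python
import numpy as np
from math import factorial

def ramp(x, y, n=1):
    """n-th order ramp rho_n(x - y) = (x - y)**n / n! for x > y, else 0; cf. (10.48)."""
    return np.where(x > y, (x - y)**n / factorial(n), 0.0)

x = np.linspace(0.0, 1.0, 11)
print(ramp(x, 0.4, 1))   # first order ramp: a corner at x = 0.4
print(ramp(x, 0.4, 2))   # second order ramp: continuous first derivative
```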

Example 10.6. The derivative of the absolute value function
$$ a(x) = |x| = \begin{cases} x, & x > 0, \\[2pt] -x, & x < 0, \end{cases} $$
is the sign function
$$ s(x) = a'(x) = \begin{cases} +1, & x > 0, \\[2pt] -1, & x < 0. \end{cases} \tag{10.49} $$
Note that there is no delta function in a′(x) because a(x) is continuous everywhere. Since s(x) has a jump of magnitude 2 at the origin and is otherwise constant, its derivative $s'(x) = a''(x) = 2\,\delta(x)$ is twice the delta function.
We can also differentiate the delta function. Its first derivative
$$ \delta_y'(x) = \delta'(x - y) $$
can be interpreted in two ways. First, we may view $\delta'(x)$ as the limit
$$ \delta'(x) = \lim_{n \to \infty} \frac{dg_n}{dx} = \lim_{n \to \infty} \frac{-2\,n^3 x}{\pi\,(1 + n^2 x^2)^2} \tag{10.50} $$
of the derivatives of the approximating functions (10.31). The graphs of these rational functions take the form of more and more concentrated spiked doublets, as illustrated in Figure 10.9. To determine the effect of the derivative on a function u(x), we compute the limiting integral
$$ \langle \delta'\,;u\rangle = \int_{-\infty}^{\infty} \delta'(x)\,u(x)\,dx = \lim_{n \to \infty} \int_{-\infty}^{\infty} g_n'(x)\,u(x)\,dx = -\lim_{n \to \infty} \int_{-\infty}^{\infty} g_n(x)\,u'(x)\,dx = -\int_{-\infty}^{\infty} \delta(x)\,u'(x)\,dx = -\,u'(0). $$
In the middle step, we used an integration by parts; the boundary terms at $\pm\infty$ vanish provided u(x) is continuously differentiable and bounded as $|x| \to \infty$. Pay attention to the minus sign in the final answer.
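The defining property $\langle \delta'\,;u\rangle = -u'(0)$ can likewise be checked by integrating the approximating doublets $g_n'$ against a test function; the sketch below (illustrative only) shows the convergence.

```python
import numpy as np
from scipy.integrate import quad

u = np.sin                          # test function, u'(0) = 1

def gprime(n, x):
    # derivative of g_n(x) = n / (pi (1 + n^2 x^2)), cf. (10.50)
    return -2 * n**3 * x / (np.pi * (1 + n**2 * x**2)**2)

for n in (10, 100, 1000):
    # split at the origin so the quadrature resolves the doublet
    val = sum(quad(lambda x: gprime(n, x) * u(x), a, b)[0]
              for (a, b) in [(-np.inf, 0.0), (0.0, np.inf)])
    print(n, val)                   # approaches -u'(0) = -1
```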
In the dual interpretation, the generalized function $\delta_y'(x)$ corresponds to the linear operator
$$ L_y[u] = -\,u'(y) = \langle \delta_y'\,;u\rangle = \int_0^\ell \delta_y'(x)\,u(x)\,dx, \qquad\text{where}\quad 0 < y < \ell, \tag{10.51} $$
that maps a continuously differentiable function u(x) to minus its derivative at the point y. We note that (10.51) is compatible with a formal integration by parts:
$$ \int_0^\ell \delta'(x - y)\,u(x)\,dx = \delta(x - y)\,u(x)\,\Big|_{x=0}^{\ell} - \int_0^\ell \delta(x - y)\,u'(x)\,dx = -\,u'(y). $$

Figure 10.10. Derivative of Delta Function as Limit of Doublets.

The boundary terms at x = 0 and x = ℓ automatically vanish since $\delta(x - y) = 0$ for $x \neq y$.


The Green's Function
To further cement our new-found friendship with the delta function, we now discuss how it is used to solve inhomogeneous boundary value problems. The key idea is to first solve the system subjected to a unit delta function impulse. The resulting solution is known as the Green's function. We can then appeal to a general superposition principle to piece together the solution for a general external force.
Consider a bar of length ℓ subject to a unit impulse force $\delta_y(x) = \delta(x - y)$ concentrated at position $0 < y < \ell$ along the bar. The underlying differential equation (10.12) takes the form
$$ -\frac{d}{dx}\Bigl( c(x)\,\frac{du}{dx} \Bigr) = \delta(x - y), \qquad 0 < x < \ell. \tag{10.52} $$
Coupled with the appropriate boundary conditions, this represents the continuum analog of the unit impulse equilibrium equations (10.25). The solution to the boundary value problem associated with (10.52) is known as the Green's function, and will be denoted by $G_y(x) = G(x, y)$.
Example 10.7. Let us look at the simple case of a homogeneous bar with uniform stiffness $c(x) \equiv 1$, of unit length ℓ = 1, and fixed at both ends. The boundary value problem for the Green's function G(x, y) takes the form
$$ -u'' = \delta(x - y), \qquad u(0) = 0 = u(1), \tag{10.53} $$
where 0 < y < 1 indicates the point at which we apply the impulse force. The solution to the differential equation is obtained immediately by integrating twice. First, by (10.40),
$$ u'(x) = -\,\sigma(x - y) + a, $$

Figure 10.11. Green's Function for a Bar with Fixed Ends.

where a is a constant of integration. A second integration leads to
$$ u(x) = -\,\rho(x - y) + a\,x + b, \tag{10.54} $$
where ρ is the ramp function (10.47). The integration constants a, b are fixed by the boundary conditions; since 0 < y < 1, we have
$$ u(0) = b = 0, \qquad u(1) = -\,\rho(1 - y) + a + b = -(1 - y) + a = 0, \qquad\text{and so}\qquad a = 1 - y. $$
Therefore, the Green's function for the problem is
$$ G(x, y) = -\,\rho(x - y) + (1 - y)\,x = \begin{cases} x\,(1 - y), & x \leq y, \\[2pt] y\,(1 - x), & x \geq y. \end{cases} \tag{10.55} $$

See Figure 10.11 for a graph of G(x, y). Note that it is continuous and piecewise affine, meaning that its graph consists of connected straight line segments, with a corner where the unit impulse force is being applied.
We observe the following fundamental properties that serve to uniquely characterize the Green's function (10.55). First, since the delta forcing vanishes except at the point x = y, the Green's function satisfies the homogeneous differential equation
$$ \frac{\partial^2 G}{\partial x^2}(x, y) = 0 \qquad\text{for all}\qquad x \neq y. \tag{10.56} $$
Secondly, by construction, it must satisfy the boundary conditions,
$$ G(0, y) = 0 = G(1, y). $$
Thirdly, G is continuous, but has a corner at x = y, which implies that its derivative $\partial G/\partial x$ has a jump discontinuity of magnitude $-1$ there. The second derivative $\partial^2 G/\partial x^2$ has a delta function discontinuity at x = y, and thereby solves the original boundary value problem (10.53). Finally, we observe that the Green's function is symmetric in x and y:
$$ G(x, y) = G(y, x). \tag{10.57} $$

This symmetry property is a consequence of the underlying symmetry or self-adjointness


of the boundary value problem; this aspect will be discussed in more depth in the following
section. It has the interesting physical consequence that the response of the bar at position
x due to an impulse force concentrated at position y is exactly the same as the response of
the bar at position y due to an impulse being applied at position x. This turns out to be a
rather general, although perhaps unexpected phenomenon; see Exercise for the discrete
case of a mass-spring chain, and Exercises and for similar statements for circuits and
structures.
Once we have determined the Green's function for the system, we can solve the general forcing problem
$$ -u'' = f(x), \qquad u(0) = 0 = u(1), \tag{10.58} $$
by linear superposition, in direct analogy with the superposition solution (10.26) of the discrete problem. In the continuum case, we need to express the forcing function f(x) as a linear combination of impulses that are concentrated at each point along the bar. Since there is a continuum of possible positions y at which the impulse forces may be applied, we need to replace the finite sum by an integral, writing the external force as
$$ f(x) = \int_0^1 f(y)\,\delta(x - y)\,dy. \tag{10.59} $$

We will interpret (10.59) as the continuous superposition of an infinite collection of impulses, $f(y)\,\delta(x - y)$, of respective magnitudes f(y) and concentrated at position y.
The general linear superposition principle states that linear combinations of inhomogeneities produce linear combinations of solutions. Again, we adapt this principle to the continuum by replacing the sums by integrals. (Indeed, the original definition of the Riemann integral is as a limit of Riemann sums.) Thus, we write the differential equation (10.58) as
$$ -u'' = \int_0^1 f(y)\,\delta(x - y)\,dy, $$
and write the solution as the same continuous superposition
$$ u(x) = \int_0^1 f(y)\,G(x, y)\,dy \tag{10.60} $$
of the Green's function solutions to the individual unit impulse problems. Plugging (10.55) into (10.60), and breaking the integral up into two parts, for y < x and y > x, we arrive at an explicit formula
$$ u(x) = \int_0^x (1 - x)\,y\,f(y)\,dy + \int_x^1 x\,(1 - y)\,f(y)\,dy \tag{10.61} $$

for the solution to the boundary value problem (10.58). For example, under a constant unit force $f(x) \equiv 1$, the solution (10.61) is
$$ u(x) = \int_0^x (1 - x)\,y\,dy + \int_x^1 x\,(1 - y)\,dy = \tfrac12\,(1 - x)\,x^2 + \tfrac12\,x\,(1 - x)^2 = \tfrac12\,x - \tfrac12\,x^2, $$


in agreement with (10.19) for σ = 1, d = 0. Although this particular problem was perhaps easier to solve directly, the use of the Green's function has the advantage of providing a unified framework that fits all of the special solution techniques for inhomogeneous boundary value problems, and really comes into its own in higher dimensions.
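The superposition formula (10.61) is straightforward to implement; the sketch below (with an arbitrarily chosen forcing, purely for illustration) compares it with the known exact solution of $-u'' = \sin\pi x$, $u(0) = u(1) = 0$.

```python
import numpy as np
from scipy.integrate import quad

def G(x, y):
    # Green's function (10.55) for -u'' = f, u(0) = u(1) = 0
    return x * (1 - y) if x <= y else y * (1 - x)

f = lambda y: np.sin(np.pi * y)                  # illustrative forcing

def u(x):
    val, _ = quad(lambda y: G(x, y) * f(y), 0.0, 1.0)   # formula (10.60)/(10.61)
    return val

exact = lambda x: np.sin(np.pi * x) / np.pi**2   # solves -u'' = sin(pi x)
for x in np.linspace(0.0, 1.0, 6):
    print(x, u(x) - exact(x))                    # differences at quadrature-error level
```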
Let us, finally, convince ourselves that the superposition formula (10.61) does indeed give the correct answer. First,
$$ \frac{du}{dx} = (1 - x)\,x\,f(x) + \int_0^x \bigl(-\,y\,f(y)\bigr)\,dy - x\,(1 - x)\,f(x) + \int_x^1 (1 - y)\,f(y)\,dy = -\int_0^1 y\,f(y)\,dy + \int_x^1 f(y)\,dy. $$
Differentiating again, we conclude that
$$ \frac{d^2 u}{dx^2} = -\,f(x), $$
as desired. As with all limiting processes, one must always be careful interchanging the
order of differentiation and integration. In all the examples considered here, the integrand
F (x, y) is sufficiently nice to allow this to be done.
Remark: In computing the derivatives of u, we make use of the calculus formula
$$ \frac{d}{dx} \int_{\alpha(x)}^{\beta(x)} F(x, y)\,dy = F\bigl(x, \beta(x)\bigr)\,\frac{d\beta}{dx} - F\bigl(x, \alpha(x)\bigr)\,\frac{d\alpha}{dx} + \int_{\alpha(x)}^{\beta(x)} \frac{\partial F}{\partial x}(x, y)\,dy \tag{10.62} $$
for the derivative of an integral with variable limits, which is a straightforward consequence of the Fundamental Theorem of Calculus and the Chain Rule.
of the Fundamental Theorem of Calculus and the Chain Rule.
Example 10.8. Consider next a uniform bar of length ℓ = 1 with one fixed and one free end. We now need to solve the boundary value problem
$$ -c\,u'' = \delta(x - y), \qquad u(0) = 0, \qquad u'(1) = 0, \tag{10.63} $$
where c is the elastic constant of the bar. Integrating twice, we find the general solution to the differential equation can be written in terms of the ramp function:
$$ u(x) = -\,\frac{1}{c}\,\rho(x - y) + a\,x + b. $$
The integration constants a, b are fixed by the boundary conditions
$$ u(0) = b = 0, \qquad u'(1) = -\,\frac{1}{c} + a = 0. $$
Therefore, the Green's function for this problem is
$$ G(x, y) = \begin{cases} x/c, & x \leq y, \\[2pt] y/c, & x \geq y. \end{cases} \tag{10.64} $$

Figure 10.12. Green's Function for Bar with One Fixed and One Free End.
See Figure 10.12 for a graph of G(x, y).
As in the previous example, the Green's function is piecewise affine, and so solves the homogeneous differential equation $u'' = 0$, except at x = y where it has a corner. Its first derivative $\partial G/\partial x$ has a jump discontinuity of magnitude $-1/c$, as required for its second derivative $\partial^2 G/\partial x^2$ to produce the correct delta function singularity. Moreover, it satisfies the boundary conditions. Note that G(x, y) is constant for x > y, since the unit impulse at x = y will only stretch the part of the bar that lies above it, while the part below hangs freely. And, as in the first example, the Green's function is a symmetric function, G(x, y) = G(y, x), which has a similar physical interpretation.
The superposition formula (10.60) becomes
$$ u(x) = \int_0^1 G(x, y)\,f(y)\,dy = \frac{1}{c} \int_0^x y\,f(y)\,dy + \frac{x}{c} \int_x^1 f(y)\,dy. \tag{10.65} $$
This gives the solution to the boundary value problem
$$ -c\,u'' = f(x), \qquad u(0) = 0, \qquad u'(1) = 0, \tag{10.66} $$
for general external force. The reader may wish to verify this directly, as we did in the previous example.
Let us conclude this section by summarizing the fundamental properties that are
satisfied by the Green's function for a boundary value problem.

Basic Properties of the Green's Function

(a) Solves the homogeneous differential equation:
$$ -\frac{\partial}{\partial x}\Bigl( c(x)\,\frac{\partial G}{\partial x}(x, y) \Bigr) = 0 \qquad\text{for all}\qquad x \neq y. $$
(b) Satisfies the boundary conditions.
(c) Jump discontinuity of magnitude $-1/c(y)$ in its derivative $\partial G/\partial x$ at x = y.
(d) Symmetry: G(y, x) = G(x, y).
(e) Superposition principle for general forcing function:
$$ u(x) = \int_0^\ell G(x, y)\,f(y)\,dy. $$

Although derived and stated for the simple case of a one-dimensional boundary value problem governing the equilibrium solution of a bar, these properties, suitably adapted, hold in a very broad range of boundary value problems, including those of higher order and in more dimensions.
Finally, the Green's function represents the continuum limit of the inverse $G = K^{-1}$ of the discrete stiffness matrix of a mass-spring chain. For a chain with many masses, the entries $G_{ij}$ of the inverse matrix are approximations to the sampled values $G(x_i, x_j)$ of the Green's function of the limiting bar. The symmetry property $G(x_i, x_j) = G(x_j, x_i)$ of the Green's function is the continuum analog of the symmetry $G_{ij} = G_{ji}$ of the inverse of the symmetric stiffness matrix. See Exercise for the corresponding physical interpretation in the discrete case.
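The claimed relationship between the discrete inverse $K^{-1}$ and the Green's function can be observed numerically. In the sketch below, the $1/\Delta x$ scaling of the spring stiffnesses is an assumption made so that the chain discretizes a fixed-fixed bar of unit stiffness; with that convention, the entries of $K^{-1}$ reproduce the sampled values $G(x_i, x_j)$ of (10.55).

```python
import numpy as np

n = 50                                   # number of springs; interior nodes x_i = i/n
h = 1.0 / n
x = np.arange(1, n) * h

# Assumed scaling: each spring has stiffness 1/h, giving K = (1/h)*tridiag(-1,2,-1).
K = (1.0 / h) * (np.diag(2.0 * np.ones(n - 1))
                 - np.diag(np.ones(n - 2), 1)
                 - np.diag(np.ones(n - 2), -1))
Kinv = np.linalg.inv(K)

def G(x, y):
    # Green's function (10.55) of the fixed-fixed bar with c = 1
    return x * (1 - y) if x <= y else y * (1 - x)

Gsample = np.array([[G(xi, xj) for xj in x] for xi in x])
print(np.max(np.abs(Kinv - Gsample)))    # tiny: K^{-1} samples G(x_i, x_j)
```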

10.3. Adjoints and Minimum Principles.


Let us now discuss how the boundary value problems for continuous elastic bars fit into
our general equilibrium framework of positive (semi-)definite linear systems. In Chapter 6,
we learned that the stable equilibrium configurations of discrete mechanical systems can
be characterized as energy minimizers. This fundamental physical principle has a direct
counterpart in the continuum systems that we have begun to consider, and the goal of
this section is to understand how to adapt the finite-dimensional constructions to infinite-dimensional function space. In particular, a general construction of minimization principles
not only leads to an important theoretical characterization of the equilibrium solution,
but, through the finite element method, leads to the most important class of numerical
algorithms used to compute them.
Adjoints of Differential Operators
In discrete mechanical systems, the crucial observation was that the matrix appearing
in the force balance law is the transpose of the incidence matrix relating displacements
and elongations or strains. In the continuum limit, the discrete incidence matrix has
turned into a differential operator, and so a crucial step is to understand how to take its

transpose. The answer to this quandary can be found in Section 7.3. The transpose of
a matrix is a particular instance of the general notion of the adjoint of a linear function,
which relies on the specification of inner products on its domain and target spaces. In
the case of the matrix transpose, the adjoint is prescribed with respect to the standard
dot product on Euclidean space. Thus, the correct interpretation of the transpose of a
differential operator is as the adjoint linear operator with respect to suitable inner products
on function space.
For bars and similar continuous one-dimensional media, the role of the incidence matrix is played by the derivative $v = D[u] = du/dx$, which defines a linear operator $D\colon U \to V$ from the vector space of possible displacements u(x), denoted by U, to the vector space of possible strains v(x), denoted by V. In order to compute the adjoint of the derivative operator, we need to impose inner products on both the displacement space U and the strain space V. The simplest situation is to adopt the same standard $L^2$ inner product
$$ \langle u\,;\widetilde u\rangle = \int_a^b u(x)\,\widetilde u(x)\,dx, \qquad \langle\!\langle v\,;\widetilde v\rangle\!\rangle = \int_a^b v(x)\,\widetilde v(x)\,dx, \tag{10.67} $$
on both vector spaces. These are the continuum analogs of the Euclidean dot product, and, as we shall see, this will correspond to the case of a homogeneous bar with unit stiffness $c \equiv 1$. According to the defining adjoint equation (7.48), the adjoint $D^*$ of the derivative operator must satisfy the inner product identity
$$ \langle\!\langle D[u]\,;v\rangle\!\rangle = \langle u\,;D^*[v]\rangle \qquad\text{for all}\qquad u \in U, \quad v \in V. \tag{10.68} $$

First, we compute the left hand side:
$$ \langle\!\langle D[u]\,;v\rangle\!\rangle = \Bigl\langle\!\!\Bigl\langle \frac{du}{dx}\,;v \Bigr\rangle\!\!\Bigr\rangle = \int_a^b \frac{du}{dx}\,v\,dx. \tag{10.69} $$
On the other hand, the right hand side should equal
$$ \langle u\,;D^*[v]\rangle = \int_a^b u\,D^*[v]\,dx. \tag{10.70} $$

Now, in the latter integral, we see u multiplying the result of applying the linear operator $D^*$ to v. To identify this integrand with that in the previous integral (10.69), we need to somehow remove the derivative from u. The secret is integration by parts! It allows us to rewrite the first integral in the form
$$ \int_a^b \frac{du}{dx}\,v\,dx = \bigl[\,u(b)\,v(b) - u(a)\,v(a)\,\bigr] - \int_a^b u\,\frac{dv}{dx}\,dx. \tag{10.71} $$

If we ignore the boundary terms $u(b)\,v(b) - u(a)\,v(a)$ for a moment, then the remaining integral is equal to an inner product:
$$ -\int_a^b u\,\frac{dv}{dx}\,dx = \int_a^b u\,\Bigl(-\frac{dv}{dx}\Bigr)\,dx = \Bigl\langle u\,;-\frac{dv}{dx} \Bigr\rangle. $$

Comparing with (10.70), we deduce that
$$ \Bigl\langle\!\!\Bigl\langle \frac{du}{dx}\,;v \Bigr\rangle\!\!\Bigr\rangle = \Bigl\langle u\,;-\frac{dv}{dx} \Bigr\rangle, \qquad\text{and so}\qquad D^* = -\frac{d}{dx}. \tag{10.72} $$
The final equation confirms our identification (10.4) of the derivative operator as the continuum limit of the incidence matrix A, and its negative as the limit (10.10) of the transposed (or adjoint) incidence matrix $A^T = A^*$.
However, the preceding argument is only valid if the boundary terms in (10.71) vanish:
$$ u(b)\,v(b) - u(a)\,v(a) = 0, \tag{10.73} $$
which necessitates imposing suitable boundary conditions on the functions u and v. For example, in the case of a bar with both ends fixed, the boundary conditions
$$ u(a) = 0, \qquad u(b) = 0 \tag{10.74} $$
will ensure that (10.73) holds, and therefore validate (10.72). The homogeneous boundary conditions serve to define the vector space
$$ U = \bigl\{\, u(x) \in C^1[a,b] \;\big|\; u(a) = u(b) = 0 \,\bigr\} $$
of allowable displacements, consisting of all continuously differentiable functions that vanish on the boundary.
The fixed boundary conditions (10.74) are not the only possible ones that ensure the vanishing of the boundary terms (10.73). An evident alternative is to require that the strain v vanish at both endpoints, v(a) = v(b) = 0. This is the case of an unsupported bar with two free ends, where the displacement at the ends is unspecified, but the strain vanishes owing to a lack of support. In this case, the strain space
$$ V = \bigl\{\, v(x) \in C^1[a,b] \;\big|\; v(a) = v(b) = 0 \,\bigr\} $$
consists of all functions that vanish on the boundary. Since the derivative $D\colon U \to V$ must map a displacement u(x) to an allowable strain v(x), the vector space of allowable displacements takes the form
$$ U = \bigl\{\, u(x) \in C^1[a,b] \;\big|\; u'(a) = u'(b) = 0 \,\bigr\}, $$
indicating free boundary conditions at both ends. Again, restricting $D\colon U \to V$ to these particular vector spaces ensures that the boundary terms (10.73) vanish, and so (10.72) holds in this situation too.
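A quick numerical check of the adjoint identity (10.72), with illustrative test functions compatible with the fixed boundary conditions, is given below.

```python
import numpy as np
from scipy.integrate import quad

# Check <<D[u];v>> = <u; -v'> on [0,1] for a displacement with u(0) = u(1) = 0.
u  = lambda x: np.sin(np.pi * x)            # vanishes at both ends
du = lambda x: np.pi * np.cos(np.pi * x)
v  = lambda x: np.exp(x)                    # arbitrary strain-space test function
dv = lambda x: np.exp(x)

lhs, _ = quad(lambda x: du(x) * v(x), 0.0, 1.0)      # << D[u] ; v >>
rhs, _ = quad(lambda x: u(x) * (-dv(x)), 0.0, 1.0)   # < u ; D*[v] > with D* = -d/dx
print(lhs, rhs)   # equal, since the boundary terms (10.73) vanish
```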
Let us list the most important combinations of boundary conditions that will imply the vanishing of the boundary terms (10.73) and ensure that the desired adjoint equation $D^* = -\,D$ is valid. In all cases, the boundary conditions impose restrictions on the displacement space U and, in cases (b)–(d), the strain space V.

Self-Adjoint Boundary Conditions for a Bar

a) Both ends fixed: u(a) = u(b) = 0.
b) One free and one fixed end: u(a) = 0, u′(b) = 0, or u′(a) = 0, u(b) = 0.
c) Both ends free: u′(a) = u′(b) = 0.
d) Periodic boundary conditions: u(a) = u(b), u′(a) = u′(b).

A fixed boundary condition u(a) = 0 is commonly referred to as a Dirichlet boundary condition. A free boundary condition u′(a) = 0 is known as a Neumann boundary condition. The Dirichlet boundary value problem has both ends fixed, while the Neumann boundary value problem has both ends free. The intermediate case b) is known as a mixed boundary value problem. The periodic boundary conditions represent a bar that has its ends joined together, i.e., a circular elastic ring. It represents the continuum limit of the periodic mass-spring chain discussed in Exercise , and forces u(x) to be a periodic function with period b − a.
In summary, for a homogeneous bar with stiffness $c(x) \equiv 1$, the connections between strain, displacement and external force take the form
$$ v = D[u] = u', \qquad f = D^*[v] = -\,v', $$
provided we impose suitable boundary conditions. The equilibrium equations are written in the self-adjoint form
$$ K[u] = f, \qquad\text{where}\qquad K = D^* \circ D = -\,D^2, \tag{10.75} $$
along with one of the listed pairs of boundary conditions. Note that
$$ K^* = (D^* \circ D)^* = D^* \circ (D^*)^* = D^* \circ D = K, \tag{10.76} $$
which proves self-adjointness of the differential operator: $K^* = K$. (We are merely repeating the proof of Theorem 7.51.) Self-adjointness implies that
$$ \langle K[u]\,;\widetilde u\rangle = \int_0^\ell \bigl(-u''(x)\bigr)\,\widetilde u(x)\,dx = \int_0^\ell u(x)\,\bigl(-\widetilde u\,''(x)\bigr)\,dx = \langle u\,;K[\widetilde u]\rangle \tag{10.77} $$
for all displacements $u, \widetilde u \in U$. A direct verification of this formula relies on two integrations by parts, using the Dirichlet boundary conditions to cancel out the ensuing boundary terms.
To deal with inhomogeneous materials in the same framework, we need to modify the inner products on the underlying function spaces. To this aim, we retain the ordinary $L^2$ inner product
$$ \langle u\,;\widetilde u\rangle = \int_a^b u(x)\,\widetilde u(x)\,dx, \qquad u, \widetilde u \in U, \tag{10.78} $$

The circle is sufficiently large that we can ignore any curvature effects.


on the vector space of possible displacements, but adopt a weighted inner product
$$ \langle\!\langle v\,;\widetilde v\rangle\!\rangle = \int_a^b v(x)\,\widetilde v(x)\,c(x)\,dx, \qquad v, \widetilde v \in V, \tag{10.79} $$
on the space of strain functions. The weight function c(x) > 0 turns out to be the stiffness function for the bar, and so its positivity corroborates the underlying physical hypotheses.
Let us compute the adjoint of the derivative operator $D\colon U \to V$ with respect to these two inner products (10.78), (10.79). Now we need to compare
$$ \langle\!\langle D[u]\,;v\rangle\!\rangle = \int_a^b \frac{du}{dx}\,v(x)\,c(x)\,dx \qquad\text{with}\qquad \langle u\,;D^*[v]\rangle = \int_a^b u(x)\,D^*[v]\,dx. $$
Integrating the first expression by parts, we find
$$ \int_a^b \frac{du}{dx}\,c\,v\,dx = \bigl[\,u(b)\,c(b)\,v(b) - u(a)\,c(a)\,v(a)\,\bigr] - \int_a^b u\,\frac{d(c\,v)}{dx}\,dx = -\int_a^b u\,\frac{d(c\,v)}{dx}\,dx, \tag{10.80} $$
provided we choose our boundary conditions so that the boundary terms vanish:
$$ u(b)\,c(b)\,v(b) - u(a)\,c(a)\,v(a) = 0. $$
This holds for any of the listed boundary conditions: Dirichlet, Neumann, mixed or periodic. Therefore, in such cases, the weighted adjoint $D^*$ of the derivative operator is
$$ D^*[v] = -\,\frac{d(c\,v)}{dx} = -\,c\,\frac{dv}{dx} - c'\,v. \tag{10.81} $$

The self-adjoint combination $K = D^* \circ D$ is now given by
$$ K[u] = -\,\frac{d}{dx}\Bigl( c(x)\,\frac{du}{dx} \Bigr), \tag{10.82} $$
which agrees with the differential operator (10.12) for a nonuniform bar.
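For a concrete, if illustrative, check of (10.81)–(10.82), one can let a computer algebra system carry out the composition $K = D^* \circ D$ for a sample stiffness c(x); the choice of c below is arbitrary.

```python
import sympy as sp

x = sp.symbols('x')
c = 1 + x**2                            # an illustrative positive stiffness c(x)
u = sp.Function('u')(x)

D     = lambda w: sp.diff(w, x)         # incidence operator D = d/dx
Dstar = lambda w: -sp.diff(c * w, x)    # weighted adjoint (10.81)

K_u = sp.simplify(Dstar(D(u)))          # K[u] = D* o D [u]
print(K_u)                              # equals -(c(x) u'(x))' , cf. (10.82)
print(sp.simplify(K_u + sp.diff(c * sp.diff(u, x), x)))   # 0
```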
Minimum Principles
According to the general Definition 7.50, a linear operator $K\colon U \to U$ on an inner product space U is positive definite provided it is (a) self-adjoint, so $K^* = K$, and (b) satisfies the positivity criterion
$$ \langle K[u]\,;u\rangle > 0 \qquad\text{for all}\qquad 0 \neq u \in U. \tag{10.83} $$
Self-adjointness of the product operator $K = D^* \circ D$ was proved in (10.76). Furthermore, Corollary 7.54 tells us that K is positive definite if and only if D has trivial kernel: $\ker D = \{0\}$. Indeed, by the definition of the adjoint,
$$ \langle K[u]\,;u\rangle = \langle D^*[D[u]]\,;u\rangle = \langle\!\langle D[u]\,;D[u]\rangle\!\rangle = \| D[u] \|^2 \geq 0, $$

so K is automatically positive semi-definite. Furthermore, $\langle K[u]\,;u\rangle = \| D[u] \|^2 = 0$ if and only if D[u] = 0, and thus the condition $\ker D = \{0\}$ is necessary and sufficient for the positivity criterion (10.83).
Now, in general, the kernel of the derivative operator D is not trivial, but contains all constant functions. However, we are viewing D as a linear operator on the vector space U of allowable displacements, and so the elements of ker D must also be allowable, meaning that they must satisfy the boundary conditions. Thus, the distinction between positive definite and only positive semi-definite reduces, in the present situation, to the question of whether or not there are any nontrivial constant functions that satisfy the prescribed homogeneous boundary conditions, and hence belong to $\ker D \subset U$.
Clearly, the only constant function that satisfies homogeneous Dirichlet boundary conditions is the zero function. Therefore, when restricted to the Dirichlet displacement space $U = \{\,u \mid u(0) = u(\ell) = 0\,\}$, the derivative operator has trivial kernel, $\ker D = \{0\}$. As a result, the composition $K = D^* \circ D > 0$ defines a positive definite linear operator on U. A similar argument applies to the mixed boundary value problems. Again, the only constant function that satisfies the homogeneous boundary conditions is the zero function, which suffices to prove positive definiteness. As we saw, both the Dirichlet and mixed boundary value problems admit a unique equilibrium solution under arbitrary external forcing, and correspond to stable mechanical systems.
On the other hand, all nonzero constant functions satisfy both the Neumann and the periodic boundary conditions, and so in both cases $\ker D \subset U$ consists of all constant functions. Therefore, $\ker K \neq \{0\}$, and the resulting boundary value problem is only positive semi-definite. Indeed, in such unstable configurations, the boundary value problem has either no solution or infinitely many equilibrium solutions, depending on the nature of the external forcing. Thus, the distinction between stable and unstable systems based on the definiteness of the underlying differential operator is in complete correspondence with the finite-dimensional story in Chapter 6.
In the positive definite, stable cases, we can characterize the solution to the homogeneous boundary value problem K[u] = f as the unique minimizer of the quadratic functional
$$ P[u] = \tfrac12\,\| D[u] \|^2 - \langle u\,;f\rangle = \int_0^\ell \Bigl[\, \tfrac12\,c(x)\,u'(x)^2 - f(x)\,u(x) \,\Bigr]\,dx. \tag{10.84} $$
A proof of this general fact appears following Theorem 7.53. Note that the norm in (10.84) refers to the strain space V, and so is associated with the weighted inner product (10.79); indeed, the first term
$$ \tfrac12\,\| D[u] \|^2 = \tfrac12\,\| v \|^2 = \int_0^\ell \tfrac12\,c(x)\,v(x)^2\,dx = \int_0^\ell \tfrac12\,v(x)\,w(x)\,dx = \tfrac12\,\langle v\,;w\rangle $$
is one half the (unweighted) inner product between stress and strain, and hence represents the internal energy of our bar. The second term represents the potential energy due to the

The inhomogeneous boundary value problem will be discussed later.


external forcing, and so, as usual, our minimization principle (10.84) represents the total
energy for the mechanical configuration.
On the other hand, the unstable Neumann and periodic boundary value problems do
not admit a minimization principle. Indeed, a solution may not even exist, or, if it does
exist, is not unique. In neither event can one characterize the solution by a quadratic
minimization procedure.
Example 10.9. Consider the homogeneous Dirichlet boundary value problem
$$ -u'' = f, \qquad u(0) = 0, \qquad u(\ell) = 0, \tag{10.85} $$
for a uniform bar with two fixed ends. This is a stable case, and so the differential operator $K = D^* \circ D = -\,D^2$, cf. (10.75), is positive definite. Explicitly, positive definiteness requires
$$ \langle K[u]\,;u\rangle = \int_0^\ell \bigl[\,-u''(x)\,u(x)\,\bigr]\,dx = \int_0^\ell u'(x)^2\,dx > 0 \tag{10.86} $$
for all nonzero $u(x) \not\equiv 0$ satisfying the boundary conditions. Note how we employed an integration by parts, using the boundary conditions to eliminate the boundary terms, to expose the positivity of the integral. The minimum principle (10.84) for the boundary value problem (10.85) can be written out as
$$ P[u] = \tfrac12\,\| u' \|^2 - \langle u\,;f\rangle = \int_0^\ell \Bigl[\, \tfrac12\,u'(x)^2 - f(x)\,u(x) \,\Bigr]\,dx. \tag{10.87} $$
In other words, the solution $u_\star(x)$ to (10.85) is the function for which $P[u_\star]$ achieves the minimal value over all possible functions u(x) satisfying the boundary conditions.
A direct proof of this fact may be instructive. As in our derivation of the adjoint operator, it relies on an integration by parts. Since $-u_\star'' = f$, we find
$$ P[u] = \int_0^\ell \bigl[\, \tfrac12\,(u')^2 + u_\star''\,u \,\bigr]\,dx = \int_0^\ell \bigl[\, \tfrac12\,(u')^2 - u_\star'\,u' \,\bigr]\,dx = \int_0^\ell \tfrac12\,(u' - u_\star')^2\,dx - \int_0^\ell \tfrac12\,(u_\star')^2\,dx, \tag{10.88} $$
where the boundary terms $u_\star'(\ell)\,u(\ell) - u_\star'(0)\,u(0)$ arising from the integration by parts vanish because u satisfies the homogeneous boundary conditions. The first integral is always $\geq 0$, and is actually equal to 0 if and only if $u' = u_\star'$. Since u and $u_\star$ are both assumed to satisfy the boundary conditions, P[u] will assume its minimum value when $u = u_\star$.
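The minimization principle is easy to probe numerically. In the sketch below (the illustrative case $c \equiv 1$, ℓ = 1, $f \equiv 1$, whose solution is $u_\star = \tfrac12 x(1-x)$), perturbing $u_\star$ by functions that respect the boundary conditions always increases the energy P.

```python
import numpy as np
from scipy.integrate import quad

f      = lambda x: 1.0
ustar  = lambda x: 0.5 * x * (1 - x)        # exact solution of -u'' = 1, u(0)=u(1)=0
dustar = lambda x: 0.5 - x

def P(uu, duu):
    # energy functional (10.87): integral of (1/2) u'^2 - f u
    val, _ = quad(lambda x: 0.5 * duu(x)**2 - f(x) * uu(x), 0.0, 1.0)
    return val

print("P[u*] =", P(ustar, dustar))          # -1/24
for eps in (0.5, 0.1, 0.01):
    pert  = lambda x, e=eps: ustar(x) + e * np.sin(2 * np.pi * x)   # still vanishes at ends
    dpert = lambda x, e=eps: dustar(x) + e * 2 * np.pi * np.cos(2 * np.pi * x)
    print(eps, P(pert, dpert))              # always larger than P[u*]
```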
Inhomogeneous Boundary Conditions
So far, we have restricted our attention to the homogeneous boundary value problem. Inhomogeneous boundary conditions are a little trickier, since the spaces of allowable displacements and allowable strains are no longer vector spaces, and so the abstract theory, as developed in Chapter 7, will not directly apply.
The simplest way to avoid this difficulty is to appeal to linear superposition in order to modify the displacement function so as to include the boundary conditions and thereby

revert to the homogeneous situation. Consider, for example, the inhomogeneous Dirichlet boundary value problem
$$ K[u] = -\frac{d}{dx}\Bigl( c(x)\,\frac{du}{dx} \Bigr) = f(x), \qquad u(0) = \alpha, \qquad u(\ell) = \beta. \tag{10.89} $$
Choose a function h(x) that satisfies the boundary conditions:
$$ h(0) = \alpha, \qquad h(\ell) = \beta. $$
Note that we are not requiring h to satisfy the differential equation, and so one, but by no means the only, possible choice is the affine function
$$ h(x) = \alpha + \frac{\beta - \alpha}{\ell}\,x. \tag{10.90} $$

Since u and h have the same boundary values, their difference
$$ \widetilde u(x) = u(x) - h(x) \tag{10.91} $$
satisfies the homogeneous Dirichlet boundary conditions
$$ \widetilde u(0) = \widetilde u(\ell) = 0. \tag{10.92} $$
Moreover, by linearity, $\widetilde u$ satisfies the modified equation
$$ K[\widetilde u] = K[u - h] = K[u] - K[h] = f - K[h] \equiv \widetilde f, $$
or, explicitly,
$$ -\frac{d}{dx}\Bigl( c(x)\,\frac{d\widetilde u}{dx} \Bigr) = \widetilde f(x), \qquad\text{where}\qquad \widetilde f = f + \frac{d}{dx}\Bigl( c(x)\,\frac{dh}{dx} \Bigr). \tag{10.93} $$
For the particular choice (10.90),
$$ \widetilde f(x) = f(x) + \frac{\beta - \alpha}{\ell}\,c'(x). $$
Thus, we have managed to convert the inhomogeneous problem into a homogeneous boundary value problem given in (10.92), (10.93). The solution to the original inhomogeneous boundary value problem is then reconstructed from the formula
$$ u(x) = \widetilde u(x) + h(x). \tag{10.94} $$

We know that the homogeneous Dirichlet boundary value problem is positive definite, and so we can characterize the solution to (10.92), (10.93) by a minimum principle, namely as the minimizer of the quadratic functional
$$ P[\widetilde u] = \tfrac12\,\| \widetilde u\,' \|^2 - \langle \widetilde u\,;\widetilde f\rangle = \int_0^\ell \Bigl[\, \tfrac12\,c(x)\,\widetilde u\,'(x)^2 - \widetilde f(x)\,\widetilde u(x) \,\Bigr]\,dx. \tag{10.95} $$

Let us rewrite the minimization principle in terms of the original displacement function u(x). We replace $\widetilde u$ and $\widetilde f$ by their formulae (10.91), (10.93); the result is
$$ \begin{aligned} P[\widetilde u] &= \tfrac12\,\| u' - h' \|^2 - \langle u - h\,;f - K[h]\rangle \\ &= \tfrac12\,\| u' \|^2 - \langle u\,;f\rangle - \langle\!\langle u'\,;h'\rangle\!\rangle + \langle u\,;K[h]\rangle + \tfrac12\,\| h' \|^2 + \langle h\,;f\rangle - \langle h\,;K[h]\rangle \\ &= P[u] - \langle\!\langle u'\,;h'\rangle\!\rangle + \langle u\,;K[h]\rangle + C_0. \end{aligned} \tag{10.96} $$
In the middle formula, the first pair of terms reproduces the quadratic energy functional (10.84) for the actual displacement u(x). The final three terms depend only on the initial choice of interpolating function h(x); thus they do not depend upon the choice of u(x), and have been lumped into the constant $C_0$, which can be effectively ignored. The middle pair are new, but can be explicitly evaluated:
$$ -\,\langle\!\langle u'\,;h'\rangle\!\rangle + \langle u\,;K[h]\rangle = -\int_0^\ell \Bigl[\, c(x)\,h'(x)\,u'(x) + \bigl(c(x)\,h'(x)\bigr)'\,u(x) \,\Bigr]\,dx = -\int_0^\ell \frac{d}{dx}\bigl[\, c(x)\,h'(x)\,u(x) \,\bigr]\,dx = c(0)\,h'(0)\,u(0) - c(\ell)\,h'(\ell)\,u(\ell). \tag{10.97} $$
In particular, if u(x) satisfies the inhomogeneous Dirichlet boundary conditions $u(0) = \alpha$, $u(\ell) = \beta$, then these terms
$$ -\,\langle\!\langle u'\,;h'\rangle\!\rangle + \langle u\,;K[h]\rangle = \alpha\,c(0)\,h'(0) - \beta\,c(\ell)\,h'(\ell) \equiv C_1 $$
also depend only on the interpolating function h and not on u. Therefore,
$$ P[\widetilde u] = P[u] + C_1 + C_0, $$
so the two functionals differ only by a constant. Consequently, if the function $\widetilde u$ minimizes $P[\widetilde u]$, then $u = \widetilde u + h$ necessarily minimizes P[u]. In this manner, we have characterized the solution to the inhomogeneous Dirichlet boundary value problem by the same minimization principle.
inhomogeneous Dirichlet boundary value problem by the same minimization principle.
Theorem 10.10. The solution $u_\star(x)$ to the Dirichlet boundary value problem
$$ -\frac{d}{dx}\Bigl( c(x)\,\frac{du}{dx} \Bigr) = f(x), \qquad u(0) = \alpha, \qquad u(\ell) = \beta, $$
is the unique function that minimizes the energy functional
$$ P[u] = \int_0^\ell \Bigl[\, \tfrac12\,c(x)\,u'(x)^2 - f(x)\,u(x) \,\Bigr]\,dx $$
among all $C^2$ functions that satisfy the indicated boundary conditions.


The mixed inhomogeneous boundary value problem, which still admits a unique solution, is slightly different since the additional terms (10.97) will depend upon the choice of
function. See Exercise for details.

Figure 10.13. Bending of a Beam.

10.4. Beams and Splines.
Unlike a bar, which can only stretch in the longitudinal direction, an elastic beam is only allowed to bend in a transverse direction. To keep the geometry simple, we consider the case in which the bending of the beam is restricted to the (x, y) plane, as sketched in Figure 10.13. (See Exercise for a discussion of the three-dimensional case.) Let $0 \leq x \leq \ell$ represent the position along the beam of reference length ℓ. Let y = u(x) represent the transverse displacement at position x.
The strain in a beam measures how far it is bent. Mathematically, bending is equal to the curvature of the graph of the displacement function u(x), and is computed by the usual calculus formula
$$ \kappa = \frac{u''}{(1 + u'^2)^{3/2}}. \tag{10.98} $$
Thus, for beams, the strain is a nonlinear function of displacement. We shall suppress the nonlinearity by assuming that the beam is not bent too far; more specifically, we assume that the derivative $u'(x) \ll 1$ is small, and so the tangent line is nearly horizontal. Under this assumption, the curvature function (10.98) is replaced by its linear approximation
$$ v = L[u] = \frac{d^2 u}{dx^2}. $$
From now on, we will identify $v = u''$ as the strain in a bending beam. The second derivative operator $L = D^2$ that maps displacement to strain thereby assumes the role of the incidence matrix for the (linearized) beam and describes its underlying geometry.
The next step is to formulate a constitutive relation that relates stress to strain. Physically, the stress w(x) represents the bending moment of the beam, defined as the product of internal force and angular deflection. As with a bar, we assume a linear Hooke's law, and so the beam stress function has the form
$$ w(x) = c(x)\,v(x) = c(x)\,\frac{d^2 u}{dx^2}, \tag{10.99} $$

By definition, the curvature of a curve at a point is equal to the reciprocal, κ = 1/r, of the radius of the osculating circle; see Exercise for details.


where the proportionality factor c(x) > 0 measures the stiffness of the beam at the point x. The beam is uniform if $c(x) \equiv c$ is a constant function.
Finally, the differential equation governing the equilibrium configuration of the beam when subject to an external force will follow from a balance of internal and external forces. To compute the internal force, we invoke our general equilibrium framework, which tells us to apply the adjoint of the incidence operator $L = D^2$ to the stress. Let us compute the adjoint. We use the ordinary $L^2$ inner product on the space of displacements u(x), and adopt a weighted inner product, based on the stiffness function c(x), between strain functions:
$$ \langle u\,;\widetilde u\rangle = \int_a^b u(x)\,\widetilde u(x)\,dx, \qquad \langle\!\langle v\,;\widetilde v\rangle\!\rangle = \int_a^b v(x)\,\widetilde v(x)\,c(x)\,dx. $$

To compute the adjoint $L^* = (D^2)^*$, we need to compare
$$ \langle\!\langle L[u]\,;v\rangle\!\rangle = \int_0^\ell L[u]\,v\,c\,dx \qquad\text{with}\qquad \langle u\,;L^*[v]\rangle = \int_0^\ell u\,L^*[v]\,dx. $$
As always, the adjoint computation relies on (in this case two) integrations by parts:
$$ \langle\!\langle L[u]\,;v\rangle\!\rangle = \int_0^\ell \frac{d^2 u}{dx^2}\,c\,v\,dx = \biggl[\, \frac{du}{dx}\,c\,v \,\biggr]_{x=0}^{\ell} - \int_0^\ell \frac{du}{dx}\,\frac{d(c\,v)}{dx}\,dx = \biggl[\, \frac{du}{dx}\,c\,v - u\,\frac{d(c\,v)}{dx} \,\biggr]_{x=0}^{\ell} + \int_0^\ell u\,\frac{d^2(c\,v)}{dx^2}\,dx. $$
Therefore, $L^*[v] = D^2(c\,v)$ provided the boundary terms vanish:
$$ \biggl[\, c\,v\,\frac{du}{dx} - u\,\frac{d(c\,v)}{dx} \,\biggr]_{x=0}^{\ell} = \biggl[\, w\,\frac{du}{dx} - u\,\frac{dw}{dx} \,\biggr]_{x=0}^{\ell} = \bigl[\, u'(\ell)\,w(\ell) - u(\ell)\,w'(\ell) \,\bigr] - \bigl[\, u'(0)\,w(0) - u(0)\,w'(0) \,\bigr] = 0. \tag{10.100} $$
Thus, the appropriate force balance equations are
$$ L^*[v] = f, \qquad\text{or, explicitly,}\qquad \frac{d^2(c\,v)}{dx^2} = \frac{d^2 w}{dx^2} = f(x). \tag{10.101} $$
A justification of (10.101) based on physical principles can be found in [8, 98]. Combining (10.99), (10.101), we conclude that the equilibrium configurations of the beam are solutions to the differential equation
$$ \frac{d^2}{dx^2}\Bigl( c(x)\,\frac{d^2 u}{dx^2} \Bigr) = f(x). \tag{10.102} $$
Since we are dealing with a fourth order ordinary differential equation (10.102), we need to impose a total of four boundary conditions, two at each end, so as to make the boundary terms (10.100) in our integration by parts computation vanish. There are a variety of ways in which this happens, and the most important possibilities are illustrated in Figure 10.14:

Figure 10.14. Boundary Conditions for a Beam: Simply Supported End, Fixed End, Free End, Sliding Clamped End.
Self-Adjoint Boundary Conditions for a Beam

a) Simply supported end: u(0) = w(0) = 0.
b) Fixed (cantilevered) end: u(0) = u′(0) = 0.
c) Free end: w(0) = w′(0) = 0.
d) Sliding clamped end: u′(0) = w′(0) = 0.

Here $w(x) = c(x)\,v(x) = c(x)\,u''(x)$ is the stress resulting from the displacement u(x). A second, similar pair of boundary conditions must be imposed at the other end x = ℓ. One can mix or match these conditions in any combination; for example, a pair of simply supported ends, or one free end and one fixed end, and so on. Another option is to consider a periodic beam (or bendable circular ring), with periodic boundary conditions
$$ u(0) = u(\ell), \qquad u'(0) = u'(\ell), \qquad w(0) = w(\ell), \qquad w'(0) = w'(\ell). $$
Inhomogeneous boundary conditions are also allowed, and are used to model applied displacements or forces at each end of the beam.
    For simplicity, let us concentrate on the case of a uniform beam, with c(x) \equiv 1, of
length \ell = 1. If there is no external forcing, the differential equation (10.102) reduces to
the homogeneous fourth order ordinary differential equation
    \frac{d^4 u}{dx^4} = 0.                                                                  (10.103)

The general solution

    u = a x^3 + b x^2 + c x + d                                                              (10.104)

is a linear combination of the four basis solutions 1, x, x^2, x^3, and is easily found by direct
integration. Let us solve a couple of representative boundary value problems.

[Figure 10.15. Hermite Cubic Spline.]

    First, suppose we fix both ends of the beam, imposing the boundary conditions

    u(0) = 0,    u'(0) = \beta,    u(1) = 0,    u'(1) = 0,                                   (10.105)

so that the left hand end is tilted by a (small) angle \tan^{-1}\beta. We substitute the solution
formula (10.104) into the boundary conditions (10.105) and solve for

    a = \beta,    b = -2\beta,    c = \beta,    d = 0.

The resulting cubic polynomial solution

    u(x) = \beta\,(x^3 - 2x^2 + x) = \beta\, x\,(1-x)^2                                      (10.106)

is known as a Hermite cubic spline, and graphed in Figure 10.15.


    As a second example, suppose that we raise the left hand end of the beam without
tilting, which corresponds to the boundary conditions

    u(0) = \alpha,    u'(0) = 0,    u(1) = 0,    u'(1) = 0.                                  (10.107)

Then the solution is

    u(x) = \alpha\,(1-x)^2\,(2x+1).                                                          (10.108)

If we simultaneously raise and tilt the left hand end, with u(0) = \alpha, u'(0) = \beta, then we
can use superposition to write the solution as the sum of (10.106) and (10.108).
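Explicitly, this superposition is u(x) = \alpha\,(1-x)^2\,(2x+1) + \beta\, x\,(1-x)^2, which is easily checked to satisfy u(0) = \alpha, u'(0) = \beta, u(1) = u'(1) = 0.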
    To analyze a forced beam, we can adapt the Green's function method. Let us treat
the case of fixed homogeneous boundary conditions

    u(0) = 0,    u'(0) = 0,    u(1) = 0,    u'(1) = 0.                                       (10.109)

We first solve the unit impulse equations

    \frac{d^4 u}{dx^4} = \delta(x - y)                                                       (10.110)

[Figure 10.16. Green's Function for a Beam with Two Fixed Ends.]

corresponding to a concentrated unit impulse applied at position y along the beam. Integrating (10.110) four times, using (10.48) with n = 4, we produce the general solution

    u(x) = a x^3 + b x^2 + c x + d + \begin{cases} \tfrac{1}{6}(x-y)^3, & x > y, \\ 0, & x < y, \end{cases}
to the differential equation (10.110). The boundary conditions require
    u(0) = d = 0,        u(1) = a + b + \tfrac{1}{6}(1-y)^3 = 0,
    u'(0) = c = 0,       u'(1) = 3a + 2b + \tfrac{1}{2}(1-y)^2 = 0,

and hence

    a = \tfrac{1}{3}(1-y)^3 - \tfrac{1}{2}(1-y)^2,
    \qquad
    b = -\tfrac{1}{2}(1-y)^3 + \tfrac{1}{2}(1-y)^2.

Therefore, the Green's function is

    G(x, y) = \begin{cases}
        \tfrac{1}{6}\, x^2 (1-y)^2 (3y - x - 2xy), & x < y, \\
        \tfrac{1}{6}\, y^2 (1-x)^2 (3x - y - 2xy), & x > y.
    \end{cases}                                                                              (10.111)

The Green's function is symmetric in x and y, so G(x, y) = G(y, x), which reflects the fact
that we are dealing with a self-adjoint system. Physically, this means that the deflection of
the beam at position x due to a unit impulse force concentrated at position y is the same as
the deflection at y due to an impulse force at x. Moreover, as a function of x, the Green's
function satisfies the homogeneous differential equation for all x \ne y. Here, G(x, y) has
two continuous derivatives; the third derivative \partial^3 G/\partial x^3 has a unit jump discontinuity at
x = y, which then produces the delta function impulse in its fourth derivative. The Green's
function (10.111) is graphed in Figure 10.16, and appears to be quite smooth. Evidently,
the human eye cannot easily discern discontinuities in third order derivatives!
    The general solution to the forced boundary value problem

    \frac{d^4 u}{dx^4} = f(x),    u(0) = u'(0) = u(1) = u'(1) = 0,                           (10.112)

[Figure 10.17. Deflection of a Uniform Beam under Gravity.]

is given by a superposition formula. We view the forcing function as a linear superposition

    f(x) = \int_0^1 f(y)\, \delta(x - y)\, dy

of impulse delta forces. By linearity, the response is the same linear superposition of
Green's function responses:

    u(x) = \int_0^1 G(x, y)\, f(y)\, dy
         = \frac{1}{6} \int_0^x y^2 (1-x)^2 (3x - y - 2xy)\, f(y)\, dy
           + \frac{1}{6} \int_x^1 x^2 (1-y)^2 (3y - x - 2xy)\, f(y)\, dy.

For example, under a constant unit downwards force f(x) \equiv 1, e.g., gravity, the deflection
of the beam is given by

    u(x) = \tfrac{1}{24} x^4 - \tfrac{1}{12} x^3 + \tfrac{1}{24} x^2 = \tfrac{1}{24}\, x^2 (1-x)^2,

and graphed in Figure 10.17. Although we could, of course, obtain u by integrating


the original differential equation (10.112) directly, having the general formula as a single
integral is more useful, particularly for numerical computations.
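For instance, the following short Python sketch (purely illustrative; the helper names are our own, and numpy is assumed available) evaluates the Green's function (10.111) and approximates the superposition integral by the trapezoidal rule, reproducing the gravity deflection x^2(1-x)^2/24.

    import numpy as np

    def beam_green(x, y):
        """Green's function (10.111) for a beam with both ends fixed on [0, 1]."""
        lo, hi = min(x, y), max(x, y)          # exploit the symmetry G(x, y) = G(y, x)
        return lo**2 * (1 - hi)**2 * (3*hi - lo - 2*lo*hi) / 6.0

    def beam_deflection(f, x, n=2000):
        """Approximate u(x) = integral of G(x, y) f(y) dy by the trapezoidal rule."""
        y = np.linspace(0.0, 1.0, n + 1)
        g = np.array([beam_green(x, yi) for yi in y])
        return np.trapz(g * f(y), y)

    # constant unit load (gravity): compare with the exact deflection x^2 (1-x)^2 / 24
    for x in (0.25, 0.5, 0.75):
        print(x, beam_deflection(lambda y: np.ones_like(y), x), x**2 * (1 - x)**2 / 24)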
    Since the beam operator has the standard self-adjoint form K = L^* \circ L, it will be
positive definite when subject to the appropriate boundary conditions. The key condition
is that \ker L = \ker D^2 = \{0\} on the space of functions satisfying the boundary conditions.
Since the second derivative D^2 annihilates all affine functions u = \alpha + \beta x, the boundary
value problem will be positive definite if and only if no non-zero affine function satisfies
all four boundary conditions. For example, having one fixed end will suffice, while two
free ends, or a simply supported plus a free end, will not. In the former case, every affine
function satisfies the boundary conditions, while in the latter u(x) = x satisfies the four
boundary conditions u(0) = w(0) = 0, w(\ell) = w'(\ell) = 0.

We have reversed the vertical axis in accordance with our convention that positive deflections
go down.


    In the positive definite cases, the solution to the beam boundary value problem can
be characterized as the unique minimizer of the quadratic energy functional

    P[u] = \tfrac{1}{2}\, \| L[u] \|^2 - \langle u ; f \rangle
         = \int_0^\ell \left[ \tfrac{1}{2}\, c(x)\, u''(x)^2 - f(x)\, u(x) \right] dx.       (10.113)

(Recall that the norm on the strain functions v = L[u] = u'' corresponds to the weighted
inner product.) Minimizing P among all functions with homogeneous boundary conditions will lead to the solution to the beam equation (10.102). Inhomogeneous boundary
conditions require a little extra work, keeping careful track of the integration by parts
required.
Splines
In pre-CAD draftsmanship, a spline was a long, thin, flexible strip of wood that was
used to draw a smooth curve connecting prescribed points. The points were marked by
small pegs, and the spline rested on the pegs. Mathematically, suppose that the spline
coincides with the graph of a function y = u(x). The pegs are fixed at the prescribed
data points (xj , yj ) for j = 0, . . . , n, and this requires u(x) to satisfy the interpolation
conditions
u(xj ) = yj ,
j = 0, . . . , n.
(10.114)
The mesh points x0 < x1 < x2 < < xn are distinct, and labeled in order.
On the intervals between each successive pair of mesh points, the spline is modeled as
an elastic beam, and so satisfies the homogeneous beam equation (10.103). Therefore,
    u(x) = a_j + b_j (x - x_j) + c_j (x - x_j)^2 + d_j (x - x_j)^3,
           x_j \le x \le x_{j+1},    j = 0, \dots, n-1,                                      (10.115)

is a piecewise cubic function, meaning that between successive mesh points it is a cubic
polynomial, but not necessarily the same cubic on each subinterval. The fact that we write
the formula (10.115) in terms of x xj is merely for computational convenience.
Our problem is to determine the coefficients

    a_j,    b_j,    c_j,    d_j,    j = 0, \dots, n-1.

Since there are n subintervals between mesh points, there are a total of 4 n coefficients, and
so we require 4 n equations to prescribe them uniquely. First, we need the spline to satisfy
the interpolation conditions (10.114). Since the spline is given by a different formula on
each side of the mesh point, this results in a total of 2 n conditions:
    u(x_j^+) = a_j = y_j,
    \qquad
    u(x_{j+1}^-) = a_j + b_j h_j + c_j h_j^2 + d_j h_j^3 = y_{j+1},
    \qquad j = 0, \dots, n-1,                                                                (10.116)
where we abbreviate the length of the j th subinterval by


    h_j = x_{j+1} - x_j.

The next step is to require that the spline be as smooth as possible. The interpolation
conditions (10.116) guarantee that u(x) is continuous. To make u(x) \in C^1, i.e., continuously
differentiable at the interior mesh points x_1, \dots, x_{n-1}, we require that u'(x) be continuous
at each x_{j+1}, which imposes the n-1 additional conditions

    b_j + 2 c_j h_j + 3 d_j h_j^2 = u'(x_{j+1}^-) = u'(x_{j+1}^+) = b_{j+1},
    \qquad j = 0, \dots, n-2.                                                                (10.117)

To make u \in C^2, we impose the n-1 further conditions

    2 c_j + 6 d_j h_j = u''(x_{j+1}^-) = u''(x_{j+1}^+) = 2 c_{j+1},
    \qquad j = 0, \dots, n-2,                                                                (10.118)

to ensure that u'' is continuous at the mesh points. We have now imposed a total of
4n - 2 conditions, namely (10.116), (10.117), and (10.118), on the 4n coefficients. The
two missing conditions will be imposed as boundary conditions at the two endpoints of the
interval, namely x_0 and x_n. There are three common boundary conditions:
    (i)  Natural (free) boundary conditions:

    u''(x_0) = u''(x_n) = 0,    so that    c_0 = 0,    c_{n-1} + 3 d_{n-1} h_{n-1} = 0.      (10.119)

Physically, this corresponds to the spline resting freely on the first and last pegs.
    (ii)  Clamped boundary conditions:

    u'(x_0) = \alpha,    u'(x_n) = \beta,

where \alpha, \beta are prescribed values. This requires

    b_0 = \alpha,    b_{n-1} + 2 c_{n-1} h_{n-1} + 3 d_{n-1} h_{n-1}^2 = \beta.              (10.120)

Physically, this corresponds to clamping the spline so as to have prescribed slopes at the
ends.
    (iii)  Periodic boundary conditions:

    u'(x_0) = u'(x_n),    u''(x_0) = u''(x_n),

so that

    b_0 = b_{n-1} + 2 c_{n-1} h_{n-1} + 3 d_{n-1} h_{n-1}^2,
    \qquad
    c_0 = c_{n-1} + 3 d_{n-1} h_{n-1}.                                                       (10.121)

This case is appropriate for periodic functions; a particularly important application is to


computerized sketching of smooth closed curves.
    Theorem 10.11.  Given data points (x_j, y_j) with a = x_0 < x_1 < \cdots < x_n = b, there
exists a unique piecewise cubic spline function u(x) \in C^2[a, b] such that u(x_j) = y_j and u
satisfies one of the three possible boundary conditions (10.119), (10.120), or (10.121).
Proof : We discuss the natural case. The clamped case is left as an exercise for the
reader, while the slightly harder periodic case will be done at the end of the section. First,
(10.116) says that

    a_j = y_j,    j = 0, \dots, n-1.                                                         (10.122)

Second, (10.118), (10.119) imply that

    d_j = \frac{c_{j+1} - c_j}{3 h_j}.                                                       (10.123)

This equation holds for j = n-1 provided we make the convention that c_n = 0.
Substituting (10.123) into (10.117),

    b_{j+1} = b_j + (c_j + c_{j+1})\, h_j.                                                   (10.124)

We now substitute (10.122), (10.123) into (10.116), and then solve the resulting equation
for

    b_j = \frac{y_{j+1} - y_j}{h_j} - \frac{(2 c_j + c_{j+1})\, h_j}{3}.                     (10.125)

Substituting this result back into (10.124), and simplifying, we find

    h_j c_j + 2 (h_j + h_{j+1})\, c_{j+1} + h_{j+1} c_{j+2}
        = 3 \left[ \frac{y_{j+2} - y_{j+1}}{h_{j+1}} - \frac{y_{j+1} - y_j}{h_j} \right] \equiv z_{j+1},     (10.126)

where we use z_{j+1} to denote the right hand side of these equations.
For the natural boundary conditions, we have

    c_0 = 0,    c_n = 0,

and so, setting c = (c_1, c_2, \dots, c_{n-1})^T, z = (z_1, z_2, \dots, z_{n-1})^T, (10.126) constitutes a
tridiagonal linear system

    A\, c = z,                                                                               (10.127)

for the unknown coefficients c_1, \dots, c_{n-1}, with coefficient matrix

    A = \begin{pmatrix}
        2(h_0+h_1) & h_1 & & & \\
        h_1 & 2(h_1+h_2) & h_2 & & \\
        & h_2 & 2(h_2+h_3) & h_3 & \\
        & & \ddots & \ddots & \ddots \\
        & & h_{n-3} & 2(h_{n-3}+h_{n-2}) & h_{n-2} \\
        & & & h_{n-2} & 2(h_{n-2}+h_{n-1})
    \end{pmatrix}.                                                                           (10.128)
Once we solve (10.127), we then use (10.122), (10.125), (10.123) to reconstruct the other
coefficients a_j, b_j, d_j.
    The key observation is that the coefficient matrix A is strictly diagonally dominant,
since all the h_j > 0, and so

    2 (h_{j-1} + h_j) > h_{j-1} + h_j.
Proposition 9.32 implies that A is nonsingular, and hence the tridiagonal linear system has
a unique solution c. This suffices to prove the theorem in the case of natural boundary
conditions.
Q.E.D.

[Figure 10.18. A Cubic Spline.]

    To actually solve the system and compute the resulting spline function, we can apply
our tridiagonal solution algorithm (1.68). Let us consider the most important case, when
the mesh points are equally spaced in the interval [a, b], so that

    x_j = a + j h,    where    h = h_j = \frac{b-a}{n},    j = 0, \dots, n-1.

In this case, the coefficient matrix A = h \tilde A is equal to h times the tridiagonal matrix

    \tilde A = \begin{pmatrix}
        4 & 1 & & & \\
        1 & 4 & 1 & & \\
        & 1 & 4 & 1 & \\
        & & 1 & 4 & 1 \\
        & & & \ddots & \ddots & \ddots
    \end{pmatrix}

that first appeared in Example 1.34. The L U factorization takes on a particularly simple
form in this case, since all the entries are essentially equal as soon as j \ge 5 or so (depending
on how much accuracy is desired). This makes the implementation of the forward and back
substitution algorithms particularly easy.
    In summary, the construction of the natural spline proceeds as follows: First, determine the coefficients c_0, \dots, c_n by solving the tridiagonal linear system (10.127) for
c_1, \dots, c_{n-1}, using the boundary conditions to set c_0 = c_n = 0. Then
use equations (10.122), (10.125), (10.123) to construct the other coefficients a_0, \dots, a_{n-1},
b_0, \dots, b_{n-1}, d_0, \dots, d_{n-1}. The resulting piecewise cubic spline (10.115) will be the unique
natural spline interpolant to the data u(x_j) = y_j for j = 0, \dots, n; a short computational
sketch of this procedure appears below.
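The following Python sketch (our own illustration, not part of the text; function names are arbitrary) assembles and solves the natural spline system (10.122)-(10.128) for possibly unequally spaced data.

    import numpy as np

    def natural_spline(x, y):
        """Return coefficients (a, b, c, d) of the natural cubic spline (10.115)."""
        x, y = np.asarray(x, float), np.asarray(y, float)
        n = len(x) - 1                      # number of subintervals
        h = np.diff(x)                      # h_j = x_{j+1} - x_j
        # right hand sides z_1, ..., z_{n-1} of (10.126)
        z = 3 * (np.diff(y[1:]) / h[1:] - np.diff(y[:-1]) / h[:-1])
        # assemble the tridiagonal matrix (10.128) and solve A c = z
        A = np.diag(2 * (h[:-1] + h[1:])) + np.diag(h[1:-1], 1) + np.diag(h[1:-1], -1)
        c = np.zeros(n + 1)                 # natural conditions: c_0 = c_n = 0
        c[1:n] = np.linalg.solve(A, z)
        a = y[:-1]                                          # (10.122)
        b = np.diff(y) / h - (2 * c[:-1] + c[1:]) * h / 3   # (10.125)
        d = (c[1:] - c[:-1]) / (3 * h)                      # (10.123)
        return a, b, c[:-1], d

    def spline_eval(x, coeffs, xq):
        """Evaluate the piecewise cubic (10.115) at the query points xq."""
        a, b, c, d = coeffs
        j = np.clip(np.searchsorted(x, xq) - 1, 0, len(a) - 1)
        t = xq - x[j]
        return a[j] + b[j]*t + c[j]*t**2 + d[j]*t**3

    # data of Figure 10.18
    x = np.array([0., 1., 2., 3., 4.])
    y = np.array([0., 2., -1., 1., 0.])
    coeffs = natural_spline(x, y)
    print(spline_eval(x, coeffs, np.linspace(0, 4, 9)))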
    Figure 10.18 shows a particular example: a natural spline passing through the data
points (0, 0), (1, 2), (2, -1), (3, 1), (4, 0). As with the Green's function for the beam, the
human eye is unable to discern the discontinuities in the third derivatives of the spline,
and so the graph appears completely smooth even though it is, in fact, only C^2.

[Figure 10.19. The B-spline \beta_3(x).]
    In the periodic case, we set

    a_n = a_0,    a_{n+1} = a_1,    etc.,

and similarly for the other coefficients. The basic systems (10.122), (10.125), (10.123) and
(10.126) are the same. Now the coefficient matrix for the linear system

    A\, c = z,    with    c = (c_0, c_1, \dots, c_{n-1})^T,    z = (z_0, z_1, \dots, z_{n-1})^T,

is no longer tridiagonal, but of circulant tridiagonal type:

    A = \begin{pmatrix}
        2(h_{n-1}+h_0) & h_0 & & & h_{n-1} \\
        h_0 & 2(h_0+h_1) & h_1 & & \\
        & h_1 & 2(h_1+h_2) & h_2 & \\
        & & \ddots & \ddots & \ddots \\
        & & h_{n-3} & 2(h_{n-3}+h_{n-2}) & h_{n-2} \\
        h_{n-1} & & & h_{n-2} & 2(h_{n-2}+h_{n-1})
    \end{pmatrix}.                                                                           (10.129)
Again A is strictly diagonally dominant, and so there is a unique solution, proving Theorem 10.11 in the periodic case. The L U factorization of such tridiagonal circulant
matrices was discussed in Exercise .
    Remark: Once we fix the mesh points x_k, each spline is uniquely determined by the
data points y_0, \dots, y_{n-1}, and hence the space of periodic splines forms an n-dimensional
vector space. In analogy with the construction of the Lagrange interpolating polynomials
(4.47), we can construct a convenient basis consisting of the B-splines \beta_0(x), \dots, \beta_{n-1}(x)
taking the values

    \beta_j(x_k) = \begin{cases} 1, & k = j, \\ 0, & k \ne j. \end{cases}                    (10.130)
See Figure 10.19 for an illustration of \beta_3(x) for the interpolation points x_k = k, where
k = 0, \dots, 7. Once the B-spline basis has been computed, the spline s(x) that interpolates a

[Figure 10.20. Three Sample Spline Letters.]

given set of data points y_0, \dots, y_{n-1} is then immediately expressed as a linear combination

    s(x) = y_0\, \beta_0(x) + \cdots + y_{n-1}\, \beta_{n-1}(x)                              (10.131)

of the basis periodic B-splines. A similar basis can be introduced in the natural case. The
clamped boundary conditions require the constants \alpha = \beta = 0 in order that the space of
splines forms a vector space.
One immediate application of splines is curve fitting in computer aided design (CAD)
and computer graphics. The basic problem is to draw a smooth curve x(t) = (x(t), y(t))
that passes through a set of prescribed data points xk = (xk , yk ) in the plane. We have
the freedom to choose the parameter values t = tk when the curve passes through the k th
point. The simplest and most common choice is to set tk = k. We then compute the
functions x(t) and y(t) as cubic splines interpolating the x and y coordinates of the points:
x(k) = xk , y(k) = yk . If the curve is closed, then we require that both splines be periodic.
Besides implementations in many computer graphics packages, this idea also underlies
modern font design for laser printing and typesetting (including this book). The great
advantage of spline fonts over their bitmapped counterparts is that they can be readily
scaled to arbitrary sizes. Some sample letter shapes parametrized by periodic splines
passing through the indicated data points are plotted in Figure 10.20. Better results can
be easily obtained by increasing the number of prescribed data points used to fix the
interpolating spline.
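As a concrete illustration (our own, not from the text), the scipy library provides a CubicSpline interpolator with periodic end conditions, so a closed curve through prescribed points can be sketched as follows; scipy is simply a convenience here, and the same circulant system (10.129) could be assembled and solved by hand as above.

    import numpy as np
    from scipy.interpolate import CubicSpline   # assumes scipy is available

    # data points of a closed curve; repeat the first point so the curve closes up
    pts = np.array([[0, 0], [2, 1], [3, 3], [1, 4], [-1, 2], [0, 0]], float)
    t = np.arange(len(pts))                     # parameter values t_k = k

    # periodic cubic splines for the x and y coordinates separately
    sx = CubicSpline(t, pts[:, 0], bc_type='periodic')
    sy = CubicSpline(t, pts[:, 1], bc_type='periodic')

    tt = np.linspace(0, len(pts) - 1, 200)
    curve = np.column_stack([sx(tt), sy(tt)])   # smooth closed curve through the points
    print(curve[:5])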

10.5. Sturm-Liouville Boundary Value Problems.


The system governing the equilibrium configurations of a bar is a particular case of a
very general class of second order boundary value problems that arise in many applications,
particularly in the analysis of partial differential equations by the method of separation of
variables. The problems were first investigated by the nineteenth century French mathematicians Jacques Sturm and Joseph Liouville. Sturm-Liouville boundary value problems
appear in a very wide range of applications, such as
    (a) Quantum mechanics: the one-dimensional Schrödinger equation.
    (b) Scattering theory: Hill's equation.
    (c) Oscillations of circular membranes (vibrations of drums): Bessel's equation.
    (d) Oscillations of a sphere: Legendre's equation,

and many others. The most interesting cases will be left to our analysis of the partial
differential equations in Chapters 16 and 17.
    The general Sturm-Liouville boundary value problem has the form

    -\frac{d}{dx}\left( p(x)\, \frac{du}{dx} \right) + q(x)\, u = f(x),                      (10.132)

along with Dirichlet, Neumann, mixed or periodic boundary conditions. To be specific, let
us concentrate on the case of homogeneous Dirichlet boundary conditions

    u(a) = 0,    u(b) = 0.                                                                   (10.133)

    To avoid singular points of the differential equation (although we will later discover
that most cases of interest in physics have one or more singular points) we assume that
p(x) > 0 for all a \le x \le b. To ensure positive definiteness of the Sturm-Liouville differential operator, we also assume q(x) \ge 0. These assumptions suffice to guarantee existence
and uniqueness of the solution to the Sturm-Liouville problem; see [20] for a proof of the
following theorem.
    Theorem 10.12.  If p(x) > 0 and q(x) \ge 0, then the Sturm-Liouville boundary
value problem (10.132), (10.133) has a unique solution.
    Sturm-Liouville boundary value problems can be placed into our general self-adjoint
framework as follows. Consider the linear operator

    L[u] = \begin{pmatrix} u' \\ u \end{pmatrix}

that maps u(x) to the vector-valued function whose components are its first derivative and
the function itself. In view of the boundary conditions (10.133), the domain of L will be the
vector space

    U = \{\, u(x) \mid u(a) = u(b) = 0 \,\} \subset C^2[a, b]

consisting of all twice continuously differentiable functions that vanish at the endpoints.
The target space of L consists of continuously differentiable vector-valued functions
v(x) = (v_1(x), v_2(x))^T; we denote this vector space by V = C^1([a, b], \mathbb{R}^2).
    We need to compute the adjoint of L: U \to V. To recover the Sturm-Liouville problem,
we use the standard L^2 inner product (10.78) on U, but adopt the weighted inner product

    \langle\langle v ; w \rangle\rangle = \int_a^b \left[ p(x)\, v_1(x)\, w_1(x) + q(x)\, v_2(x)\, w_2(x) \right] dx,
    \qquad
    v = \begin{pmatrix} v_1 \\ v_2 \end{pmatrix},
    \quad
    w = \begin{pmatrix} w_1 \\ w_2 \end{pmatrix},                                            (10.134)

on V. The positivity assumptions on the weight functions p, q ensure that this is a bona
fide inner product. As usual, the adjoint L^*: V \to U is required to satisfy (7.48), which is

    \langle\langle L[u] ; v \rangle\rangle = \langle u ; L^*[v] \rangle.

As always, the adjoint computation relies on an integration by parts:

    \langle\langle L[u] ; v \rangle\rangle
      = \int_a^b \left[ p\, u'\, v_1 + q\, u\, v_2 \right] dx
      = p(b)\, u(b)\, v_1(b) - p(a)\, u(a)\, v_1(a) + \int_a^b u \left[ -(p\, v_1)' + q\, v_2 \right] dx.

The boundary terms p(b)\, u(b)\, v_1(b) - p(a)\, u(a)\, v_1(a) = 0 vanish owing to the Dirichlet
conditions (10.133). Therefore,

    \langle\langle L[u] ; v \rangle\rangle = \int_a^b u \left[ -(p\, v_1)' + q\, v_2 \right] dx = \langle u ; L^*[v] \rangle,

and we conclude that the adjoint operator is given by

    L^*[v] = -\frac{d(p\, v_1)}{dx} + q\, v_2.

The usual self-adjoint combination

    K[u] = L^* \circ L[u] = L^*\!\begin{pmatrix} du/dx \\ u \end{pmatrix}
         = -\frac{d}{dx}\left( p\, \frac{du}{dx} \right) + q\, u                             (10.135)

reproduces the Sturm-Liouville differential operator. Moreover, since \ker L = \{0\} is trivial,
the boundary value problem is positive definite. According to Corollary 7.54, the solution
to the boundary value problem is characterized as the minimizer of the quadratic functional

    P[u] = \tfrac{1}{2}\, \| L[u] \|^2 - \langle u ; f \rangle
         = \int_a^b \left[ \tfrac{1}{2}\, p(x)\, u'(x)^2 + \tfrac{1}{2}\, q(x)\, u(x)^2 - f(x)\, u(x) \right] dx.        (10.136)

Note that the norm in this formula refers to the target space V , which has the weighted
inner product (10.134), while the inner product refers to the domain space U .

    Example 10.13.  Consider the constant coefficient Sturm-Liouville boundary value
problem

    -u'' + \omega^2 u = f(x),    u(0) = u(1) = 0.                                            (10.137)

The functions p(x) \equiv 1 and q(x) \equiv \omega^2 > 0 are both constant. The solutions can be
characterized as minimizing the quadratic functional

    P[u] = \int_0^1 \left[ \tfrac{1}{2}\, u'(x)^2 + \tfrac{1}{2}\, \omega^2 u(x)^2 - f(x)\, u(x) \right] dx.             (10.138)

    Let us solve this problem by constructing the Green's function. Thus, we consider a
delta function inhomogeneity

    -u'' + \omega^2 u = \delta(x - y),    u(0) = u(1) = 0.                                   (10.139)

Rather than try to integrate this differential equation directly, let us appeal to the defining
properties of the Green's function. The general solution to the homogeneous equation is a

linear combination of the two basic exponentials e^{\omega x} and e^{-\omega x}, or better, the hyperbolic
functions

    \cosh \omega x = \frac{e^{\omega x} + e^{-\omega x}}{2},
    \qquad
    \sinh \omega x = \frac{e^{\omega x} - e^{-\omega x}}{2}.                                 (10.140)

The solutions satisfying the first boundary condition are multiples of \sinh \omega x, while the
solutions satisfying the second boundary condition are multiples of \sinh \omega(1-x). Therefore,
the solution to (10.139) has the form

    G(x, y) = \begin{cases} a\, \sinh \omega x, & x < y, \\ b\, \sinh \omega(1-x), & x > y. \end{cases}                  (10.141)

Continuity of G(x, y) at x = y requires

    a\, \sinh \omega y = b\, \sinh \omega(1-y).                                              (10.142)

Secondly, the derivative \partial G/\partial x must have a jump discontinuity of magnitude -1 at x = y
in order that the second derivative term in (10.139) match the delta function. Since

    \frac{\partial G}{\partial x}(x, y) = \begin{cases} a\, \omega \cosh \omega x, & x < y, \\ -b\, \omega \cosh \omega(1-x), & x > y, \end{cases}

the jump condition requires

    a\, \omega \cosh \omega y - 1 = -b\, \omega \cosh \omega(1-y).                           (10.143)
If we multiply (10.142) by \cosh \omega(1-y) and (10.143) by \sinh \omega(1-y)/\omega, and add the two together,
the result is

    \frac{\sinh \omega(1-y)}{\omega}
      = a \left[ \sinh \omega y \cosh \omega(1-y) + \cosh \omega y \sinh \omega(1-y) \right]
      = a\, \sinh \omega,

where we used the addition formula

    \sinh(s + t) = \sinh s \cosh t + \cosh s \sinh t                                         (10.144)

for the hyperbolic sine, cf. Exercise . Therefore,

    a = \frac{\sinh \omega(1-y)}{\omega \sinh \omega},
    \qquad
    b = \frac{\sinh \omega y}{\omega \sinh \omega},

and the Green's function for our boundary value problem is

    G(x, y) = \begin{cases}
        \dfrac{\sinh \omega(1-y)\, \sinh \omega x}{\omega \sinh \omega}, & x < y, \\[2mm]
        \dfrac{\sinh \omega(1-x)\, \sinh \omega y}{\omega \sinh \omega}, & x > y.
    \end{cases}

[Figure 10.21. Green's Function for the Constant Coefficient Sturm-Liouville Problem.]
Problem.

A graph appears in Figure 10.21; note the corner appearing where the impulse force is
applied.
    The general solution to the inhomogeneous boundary value problem (10.137) is given
by the usual superposition formula (10.60), which becomes

    u(x) = \int_0^1 G(x, y)\, f(y)\, dy
         = \int_0^x \frac{\sinh \omega(1-x)\, \sinh \omega y}{\omega \sinh \omega}\, f(y)\, dy
           + \int_x^1 \frac{\sinh \omega(1-y)\, \sinh \omega x}{\omega \sinh \omega}\, f(y)\, dy.

For example, under a constant unit force f(x) \equiv 1, the solution is

    u(x) = \int_0^x \frac{\sinh \omega(1-x)\, \sinh \omega y}{\omega \sinh \omega}\, dy
           + \int_x^1 \frac{\sinh \omega(1-y)\, \sinh \omega x}{\omega \sinh \omega}\, dy
         = \frac{\sinh \omega(1-x)\, (\cosh \omega x - 1)}{\omega^2 \sinh \omega}
           + \frac{\sinh \omega x\, (\cosh \omega(1-x) - 1)}{\omega^2 \sinh \omega}
         = \frac{1}{\omega^2} - \frac{\sinh \omega x + \sinh \omega(1-x)}{\omega^2 \sinh \omega}.            (10.145)

For comparative purposes, the reader may wish to rederive this particular solution by the
direct method, without appealing to the Green's function.
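As a quick numerical check (our own illustration; the variable names are arbitrary), one can compare the closed form (10.145) with a simple finite difference solution of (10.137) for f \equiv 1:

    import numpy as np

    omega, n = 2.0, 200
    h = 1.0 / n
    x = np.linspace(0.0, 1.0, n + 1)

    # finite difference discretization of  -u'' + omega^2 u = 1,  u(0) = u(1) = 0
    A = (np.diag(np.full(n - 1, 2.0 / h**2 + omega**2))
         + np.diag(np.full(n - 2, -1.0 / h**2), 1)
         + np.diag(np.full(n - 2, -1.0 / h**2), -1))
    u_fd = np.zeros(n + 1)
    u_fd[1:n] = np.linalg.solve(A, np.ones(n - 1))

    # closed form (10.145)
    u_exact = 1/omega**2 - (np.sinh(omega*x) + np.sinh(omega*(1 - x))) / (omega**2 * np.sinh(omega))

    print(np.max(np.abs(u_fd - u_exact)))   # small discrepancy, of order h^2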

10.6. Finite Elements.


The characterization of the solution to a positive definite boundary value problem
via a minimization principle inspires a very powerful and widely applicable numerical
method of solution, known as the finite element method. In this section, we give a brief
introduction to the finite element method in the context of ordinary differential equations.

Adaptations to boundary value problems arising from partial differential equations will
appear in Section 14.5.
    The underlying idea is quite simple. We are trying to find the solution to a boundary
value problem by minimizing a quadratic functional P[u] on an infinite-dimensional vector
space U. The solution u^\star(x) \in U to this minimization problem requires the solution to
a differential equation with boundary conditions. However, as we learned in Chapter 4,
if we were to minimize the functional on a finite-dimensional subspace W \subset U, then this
becomes a problem in linear algebra, and, moreover, one that we have already solved!
Of course, restricting the functional P[u] to the subspace W will not, barring luck, lead
to the true minimizer. Nevertheless, if we choose W to be a sufficiently large subspace,
the resulting minimizer w^\star \in W may very well provide a reasonable approximation to the
true minimizer u^\star \in U. The analysis of the finite element method, cf. [FE], provides a
rigorous justification of this process. Here, we concentrate on understanding how to apply
the method in practice.
    To be a bit more explicit, we consider the basic minimization principle

    P[u] = \tfrac{1}{2}\, \| L[u] \|^2 - \langle f ; u \rangle,                              (10.146)

for the boundary value problem

    K[u] = f,    where    K = L^* \circ L.

As we have learned, the norm in (10.146) is typically based on some form of weighted inner
product \langle\langle \cdot ; \cdot \rangle\rangle on the space of strains v = L[u] \in V, while the inner product term \langle f ; u \rangle
is typically (although not necessarily) unweighted on the space of displacements u \in U.
The linear operator takes the self-adjoint form K = L^* \circ L, and must be positive definite,
which requires \ker L = \{0\}. Without the positivity assumption, the boundary value
problem has either no solutions, or infinitely many; in either event, the basic finite element
method will not work.
    Rather than try to minimize P[u] on the entire function space U, we now seek to
minimize it on a suitably chosen finite-dimensional subspace W \subset U, the elements of
which are required to satisfy the boundary conditions. We begin by choosing a basis
\varphi_1, \dots, \varphi_n of our finite-dimensional subspace W. The general element of W is a linear
combination

    w(x) = c_1 \varphi_1(x) + \cdots + c_n \varphi_n(x)                                      (10.147)
of the basis functions. Our goal, then, is to determine the coefficients c_1, \dots, c_n such that
w(x) minimizes P[w] among all such functions. Substituting (10.147) into (10.146) and
expanding, we find

    P[w] = \frac{1}{2} \sum_{i,j=1}^{n} m_{ij}\, c_i\, c_j - \sum_{i=1}^{n} b_i\, c_i
         = \tfrac{1}{2}\, c^T M\, c - c^T b,                                                 (10.148)

where

In this case, an orthonormal basis is not of any particular help.


    (i)   c = (c_1, c_2, \dots, c_n)^T is the vector of unknown coefficients in (10.147),
    (ii)  M = (m_{ij}) is the symmetric n \times n matrix with entries

              m_{ij} = \langle\langle L[\varphi_i] ; L[\varphi_j] \rangle\rangle,    i, j = 1, \dots, n,     (10.149)

    (iii) b = (b_1, b_2, \dots, b_n)^T is the vector with entries

              b_i = \langle f ; \varphi_i \rangle,    i = 1, \dots, n.                                       (10.150)

Note that once we specify the basis functions \varphi_i, the coefficients m_{ij} and b_i are all known
quantities. Therefore, we have reduced our original problem to a finite-dimensional problem of minimizing a quadratic function (10.148) over all possible vectors c \in \mathbb{R}^n. The
coefficient matrix M is, in fact, positive definite, since, by the preceding computation,

    c^T M\, c = \sum_{i,j=1}^{n} m_{ij}\, c_i\, c_j = \| L[w] \|^2 > 0                       (10.151)

as long as L[w] \ne 0. Our positivity assumption implies that L[w] = 0 if and only if
w \equiv 0, and hence (10.151) is positive for all c \ne 0. We can now invoke the original finite-dimensional minimization Theorem 4.2 to conclude that the unique minimizer to (10.148)
of the form (10.147) is obtained by solving the associated linear system

    M c = b                                                                                  (10.152)

by Gaussian elimination, or, alternatively, by an iterative linear system solver.


    The main issue, then, is how to effectively choose the finite-dimensional subspace W.
We already know a few potential candidates. One is the space P^{(n)} of polynomials of
degree \le n. Another is the space T^{(n)} of trigonometric polynomials of degree \le n; this
will be the focus of Chapter 11. However, neither of these is particularly suitable in the
present situation for a variety of reasons. One criterion is that the functions in W must
satisfy the relevant boundary conditions. More importantly, in order to obtain sufficient
accuracy, the resulting linear system (10.152) will typically be rather large, and so the
coefficient matrix (10.149) should be as sparse as possible, i.e., have lots of zero entries.
Otherwise, computing the solution will prove to be too time consuming. This consideration
will be of critical importance when applying the method to partial differential equations
in several variables.
    The really innovative contribution of the finite element method is to first (paradoxically) enlarge the space of allowable functions upon which to minimize P[u]. The governing differential equation requires its solutions to have a certain degree of smoothness,
whereas the associated minimization principle typically requires only half as many derivatives. Thus, for second order boundary value problems, including bars and general Sturm-Liouville problems, the quadratic functional only involves first order derivatives. It can be
rigorously shown that the functional has the same minimizing solution, even if one allows
functions that do not have enough derivatives to satisfy the differential equation. Thus,
one can try minimizing over subspaces containing fairly rough functions. Again, the
justification of this method requires some deep analysis of the problem, which is beyond
the scope of this introductory treatment.

[Figure 10.22. A Continuous Piecewise Affine Function.]

For second order boundary value problems, a popular and effective choice of the finite-dimensional subspace is to use continuous, piecewise affine functions. Recall that a function
is affine, f (x) = a x + b, if and only if its graph is a straight line. The function is piecewise
affine if its graph consists of a finite number of straight line segments; a typical example is
plotted in Figure 10.22. Note that continuity requires that the line segments be connected
together at their ends.
Given a boundary value problem on a bounded interval [ a, b ], let us choose a set of
mesh points
    a = x_0 < x_1 < x_2 < \cdots < x_n = b.
The formulas simplify if one uses equally spaced mesh points, but this is not necessary for
the method to apply. Let W denote the vector space consisting of all continuous piecewise
affine functions with corners at the nodes that satisfy the boundary conditions. To be
specific, let us treat the case of homogeneous Dirichlet (fixed) boundary conditions
    w(a) = w(b) = 0.                                                                         (10.153)

Thus, on each subinterval

    w(x) = c_j + b_j (x - x_j),    for    x_j \le x \le x_{j+1},    j = 0, \dots, n-1.

Continuity of w(x) requires

    c_j = w(x_j^+) = w(x_j^-) = c_{j-1} + b_{j-1} h_{j-1},    j = 1, \dots, n-1,             (10.154)

where h_{j-1} = x_j - x_{j-1} denotes the length of the (j-1)st subinterval. The boundary
conditions (10.153) require

    w(a) = c_0 = 0,    w(b) = c_{n-1} + h_{n-1} b_{n-1} = 0.                                 (10.155)

The function w(x) relies on a total of 2n different coefficients c_0, \dots, c_{n-1}, b_0, \dots, b_{n-1}.
The continuity conditions (10.154) and the second boundary condition (10.155) uniquely
determine the b_j, and so the coefficients c_j = w(x_j), j = 1, \dots, n-1, are arbitrary. This
means that our vector space W has dimension n-1, the number of interior mesh points.

[Figure 10.23. A Hat Function.]

    Remark: Every function w(x) in our subspace has piecewise constant first derivative
w'(x). However, the jump discontinuities in w'(x) imply that the second derivative w''(x)
has a delta function impulse at each mesh point, and is therefore far from being a true
solution. Nevertheless, for most boundary value problems, the finite element minimizer
w^\star(x) will, in practice, provide a reasonable approximation to the true solution u^\star(x).

    The most convenient basis for the space W consists of the hat functions, which are
continuous, piecewise affine functions that interpolate the same basis data as the Lagrange
polynomials (4.47) and the B-splines (10.130), namely

    \varphi_j(x_k) = \begin{cases} 1, & j = k, \\ 0, & j \ne k, \end{cases}
    \qquad\text{for}\qquad j = 1, \dots, n-1,    k = 0, \dots, n.

The graph of a typical hat function appears in Figure 10.23. The explicit formula is easily
established:

    \varphi_j(x) = \begin{cases}
        \dfrac{x - x_{j-1}}{x_j - x_{j-1}}, & x_{j-1} \le x \le x_j, \\[2mm]
        \dfrac{x_{j+1} - x}{x_{j+1} - x_j}, & x_j \le x \le x_{j+1}, \\[2mm]
        0, & x \le x_{j-1} \ \text{or} \ x \ge x_{j+1},
    \end{cases}
    \qquad j = 1, \dots, n-1.                                                                (10.156)

An advantage of using these basis elements is that the resulting positive definite coefficient
matrix (10.149) turns out to be tridiagonal. Therefore, the solution to the linear system
(10.152) is very fast; see (1.68). Since the accuracy of the finite element solution increases
with the number of mesh points, the tridiagonal solution algorithm allows us to easily
compute very accurate approximations through this scheme.
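A direct transcription of (10.156) into Python might look as follows (our own illustrative helper, not from the text; the node array is assumed sorted, and the argument x is assumed to be a numpy array):

    import numpy as np

    def hat(j, x, nodes):
        """Evaluate the hat function (10.156) for interior node j at the points x."""
        xl, xm, xr = nodes[j-1], nodes[j], nodes[j+1]
        x = np.asarray(x, float)
        v = np.zeros_like(x)
        rising = (x >= xl) & (x <= xm)      # ramp up on [x_{j-1}, x_j]
        falling = (x > xm) & (x <= xr)      # ramp down on [x_j, x_{j+1}]
        v[rising] = (x[rising] - xl) / (xm - xl)
        v[falling] = (xr - x[falling]) / (xr - xm)
        return v

    nodes = np.linspace(0.0, 1.0, 6)
    print(hat(2, np.array([0.3, 0.4, 0.5]), nodes))   # equals 1 at the node x_2 = 0.4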

Example 10.14. Consider the equilibrium equations (10.12) for a non-uniform bar
subject to homogeneous Dirichlet boundary conditions. We begin with the minimization

principle (10.84), which is based on the quadratic functional

    P[u] = \tfrac{1}{2}\, \| u' \|^2 - \langle f ; u \rangle
         = \int_0^\ell \left[ \tfrac{1}{2}\, c(x)\, u'(x)^2 - f(x)\, u(x) \right] dx.

We divide the interval [0, \ell] into n equal subintervals, each of length h = \ell/n. The resulting
uniform mesh is

    x_j = j h = \frac{j \ell}{n},    j = 0, \dots, n.                                        (10.157)
The corresponding basis hat functions are explicitly given by

    \varphi_j(x) = \begin{cases}
        (x - x_{j-1})/h, & x_{j-1} \le x \le x_j, \\
        (x_{j+1} - x)/h, & x_j \le x \le x_{j+1}, \\
        0, & \text{otherwise},
    \end{cases}
    \qquad j = 1, \dots, n-1.                                                                (10.158)

The associated linear system (10.152) has coefficient matrix entries

    m_{ij} = \langle\langle \varphi_i' ; \varphi_j' \rangle\rangle = \int_0^\ell \varphi_i'(x)\, \varphi_j'(x)\, c(x)\, dx,
    \qquad i, j = 1, \dots, n-1.

Since the function \varphi_i(x) vanishes except on the interval x_{i-1} < x < x_{i+1}, while \varphi_j(x)
vanishes outside x_{j-1} < x < x_{j+1}, the integral will vanish unless i = j or i = j \pm 1.
Therefore, the coefficient matrix has the tridiagonal form

    M = \begin{pmatrix}
        s_0 + s_1 & -s_1 & & & \\
        -s_1 & s_1 + s_2 & -s_2 & & \\
        & -s_2 & s_2 + s_3 & -s_3 & \\
        & & \ddots & \ddots & \ddots \\
        & & -s_{n-3} & s_{n-3} + s_{n-2} & -s_{n-2} \\
        & & & -s_{n-2} & s_{n-2} + s_{n-1}
    \end{pmatrix},                                                                           (10.159)

where

    s_j = \frac{1}{h^2} \int_{x_j}^{x_{j+1}} c(x)\, dx,    j = 0, \dots, n-1,                (10.160)

is 1/h^2 times the total stiffness on the j-th subinterval. The corresponding right hand
side has entries

    b_j = \langle f ; \varphi_j \rangle = \int_0^\ell f(x)\, \varphi_j(x)\, dx
        = \frac{1}{h} \left[ \int_{x_{j-1}}^{x_j} (x - x_{j-1})\, f(x)\, dx + \int_{x_j}^{x_{j+1}} (x_{j+1} - x)\, f(x)\, dx \right],
          \qquad j = 1, \dots, n-1.                                                          (10.161)
    In practice, we do not have to explicitly evaluate the integrals (10.160), (10.161), but
may replace them by a suitably close numerical approximation. When h \ll 1 is small,

[Figure 10.24. Finite Element Solution to (10.165).]

then the integrals are taken over small intervals, and we can use the trapezoid rule to
approximate them:

    s_j \approx \frac{c(x_j) + c(x_{j+1})}{2h},
    \qquad
    b_j \approx f(x_j)\, h.                                                                  (10.162)
    For example, in the homogeneous case c(x) \equiv 1, each s_j = 1/h, and the matrix (10.159)
reduces to the very special form

    M = \frac{1}{h} \begin{pmatrix}
        2 & -1 & & & \\
        -1 & 2 & -1 & & \\
        & -1 & 2 & -1 & \\
        & & \ddots & \ddots & \ddots \\
        & & & -1 & 2 & -1 \\
        & & & & -1 & 2
    \end{pmatrix}.                                                                           (10.163)
The j-th entry of the finite element equation M c = b is, upon dividing by h, given by

    -\frac{c_{j+1} - 2 c_j + c_{j-1}}{h^2}
      \,\approx\, -\frac{u(x_{j+1}) - 2 u(x_j) + u(x_{j-1})}{h^2} = f(x_j).                  (10.164)

The left hand side of (10.164) is, interestingly, minus the standard finite difference approximation to the second derivative u''(x_j) of the displacement at the mesh point x_j;
see Section 13.5. Therefore, in this case, the finite element and finite difference numerical
solution methods happen to coincide.
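The assembly of (10.159)-(10.162) is easy to automate. The sketch below is our own illustration (the name bar_fem is not from the text); it uses the trapezoid approximations (10.162) on a uniform mesh, and is reused in the next example.

    import numpy as np

    def bar_fem(c, f, ell=1.0, n=20):
        """Piecewise affine finite element solution of -(c(x) u')' = f(x), u(0)=u(ell)=0.

        Uses the uniform mesh (10.157) and the trapezoid approximations (10.162)."""
        h = ell / n
        x = np.linspace(0.0, ell, n + 1)
        s = (c(x[:-1]) + c(x[1:])) / (2 * h)            # s_0, ..., s_{n-1}
        b = f(x[1:-1]) * h                              # b_1, ..., b_{n-1}
        M = (np.diag(s[:-1] + s[1:])                    # tridiagonal matrix (10.159)
             + np.diag(-s[1:-1], 1) + np.diag(-s[1:-1], -1))
        u = np.zeros(n + 1)                             # boundary values u(0) = u(ell) = 0
        u[1:-1] = np.linalg.solve(M, b)
        return x, u

For c \equiv 1 this reproduces the finite difference scheme (10.164), as remarked above.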

    Example 10.15.  Consider the boundary value problem

    -\frac{d}{dx}\left( (x+1)\, \frac{du}{dx} \right) = 1,    u(0) = 0,    u(1) = 0.         (10.165)

One might be tempted to use more accurate numerical integration procedures, but the improvement is not very significant, particularly if the step size h is small.


The explicit solution is easily found:

    u(x) = -x + \frac{\log(x+1)}{\log 2}.                                                    (10.166)

The finite element system (10.152) has coefficient matrix given by (10.159) and right
hand side (10.161), where

    s_j = \frac{1}{h^2} \int_{x_j}^{x_{j+1}} (1+x)\, dx = \frac{1 + x_j}{h} + \frac{1}{2} = \frac{1}{h} + j + \frac{1}{2},
    \qquad
    b_j = h.
The resulting solution is plotted in Figure 10.24. The first three graphs contain, respectively, 5, 10, 20 points in the mesh, so that h = .2, .1, .05, while the last plots the exact
solution (10.166). Thus, even for rather coarse meshes, the finite element approximation
is quite good.
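Continuing the illustrative sketch above, the boundary value problem (10.165) and the exact solution (10.166) can be compared directly; this snippet assumes the bar_fem helper defined earlier.

    import numpy as np

    x, u = bar_fem(c=lambda x: x + 1, f=lambda x: np.ones_like(x), ell=1.0, n=20)
    u_exact = -x + np.log(x + 1) / np.log(2)
    print(np.max(np.abs(u - u_exact)))   # small even on this coarse mesh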
    Example 10.16.  Consider the Sturm-Liouville boundary value problem

    -u'' + (x+1)\, u = x\, e^x,    u(0) = 0,    u(1) = 0.                                    (10.167)

The solution minimizes the quadratic functional (10.136), which in this particular case is

    P[u] = \int_0^1 \left[ \tfrac{1}{2}\, u'(x)^2 + \tfrac{1}{2}\, (x+1)\, u(x)^2 - x\, e^x\, u(x) \right] dx,           (10.168)

over all functions u(x) that satisfy the boundary conditions. We lay out a uniform mesh
of step size h = 1/n and the corresponding basis hat functions as in (10.158). Using the
trapezoid approximation, the matrix entries are

    m_{ij} = \int_0^1 \left[ \varphi_i'(x)\, \varphi_j'(x) + (x+1)\, \varphi_i(x)\, \varphi_j(x) \right] dx
           \approx \begin{cases}
               \dfrac{2}{h} + \tfrac{2}{3}\, h\, (x_i + 1), & i = j, \\[2mm]
               -\dfrac{1}{h} + \tfrac{1}{6}\, h\, (x_i + 1), & |i - j| = 1, \\[2mm]
               0, & \text{otherwise},
           \end{cases}

while

    b_i = \langle x\, e^x ; \varphi_i \rangle = \int_0^1 x\, e^x\, \varphi_i(x)\, dx \approx x_i\, e^{x_i}\, h.

The resulting solution is plotted in Figure 10.25. As in the previous figure, the first three
graphs contain, respectively, 5, 10, 20 points in the mesh, while the last plots the exact
solution.
So far, we have exclusively handled homogeneous boundary conditions. An inhomogeneous boundary value problem does not immediately fit into our framework since the set
of functions satisfying the boundary conditions does not form a subspace. As discussed
at the end of Section 10.3, one way to get around this problem is to replace u(x) by
\tilde u(x) = u(x) - h(x), where h(x) is any function that satisfies the boundary conditions. For
example, for the inhomogeneous Dirichlet conditions

    u(a) = \alpha,    u(b) = \beta,

[Figure 10.25. Finite Element Solution to (10.167).]

one can use the affine function

    h(x) = \frac{(\beta - \alpha)\, x + \alpha\, b - \beta\, a}{b - a}.

Linearity implies that the difference \tilde u(x) = u(x) - h(x) will satisfy the modified differential
equation

    K[\tilde u] = \tilde f,    where    \tilde f = f - K[h],

with homogeneous boundary conditions. The modified homogeneous boundary value problem can then be solved by the standard finite element method. Another possible choice
for the modifier function h(x) is a combination of elements at the endpoints:

    h(x) = \alpha\, \varphi_0(x) + \beta\, \varphi_n(x),

where \varphi_0, \varphi_n are again piecewise affine, and equal to 1 at the end nodes x_0 = a, x_n = b,
respectively, and zero at all other nodes. Details are left to the reader.
    Finally, one can use other functions beyond the piecewise affine hat functions (10.156)
to span the finite element subspace. Another popular choice, essential for higher order
boundary value problems such as beams, is to use splines. Thus, once we have chosen
our mesh points, we can let the basis functions be the cubic B-splines \beta_j(x), as in (10.130). The
one complication is at the endpoints of the interval, where one needs to modify \beta_1(x) and
\beta_{n-1}(x) to satisfy the boundary conditions. Since \beta_j(x) = 0 for x \le x_{j-2} or x \ge x_{j+2}, the
coefficient matrix (10.149) is pentadiagonal, which means m_{ij} = 0 whenever |i - j| > 2.
Pentadiagonal matrices are not quite as nice as their tridiagonal cousins, but they are still
quite sparse. Positive definiteness of M implies that an iterative solution technique can be
effectively applied to approximate the solution to the linear system, and thereby produce
the finite element spline approximation to the boundary value problem.
Weak Solutions
An alternative approach to the finite element solution method, which also applies when
there is no convenient minimization principle available, is through the concept of a weak

solution to a differential equation. Weak solutions generalize the classical notion of smooth
(differentiable) solutions, and are particularly appropriate in the study of discontinuous
and nonsmooth physical phenomena, such as shock waves, cracks and dislocations in elastic
media, singularities in liquid crystals, and so on. We refer the reader to Section 21.1 for
nonlinear models of such phenomena.
The starting point is a trivial observation: the only vector that is orthogonal to every
vector is the zero vector. More precisely,
    Lemma 10.17.  If V is an inner product space, then w \in V is orthogonal to every
vector,

    \langle w ; v \rangle = 0    for all    v \in V,                                         (10.169)

if and only if w = 0.
    Proof: Choose v = w. Then (10.169) implies \| w \|^2 = 0, and so w = 0.    Q.E.D.

    Note that the result holds equally well for finite-dimensional and infinite-dimensional
vector spaces. Suppose we are trying to solve a linear system

    K[u] = f,                                                                                (10.170)

where K: U \to V is a linear operator between inner product spaces. Using the lemma, this
can be reformulated as requiring

    \langle K[u] ; v \rangle = \langle f ; v \rangle    for all    v \in V.

According to the definition of the adjoint, one can replace K by K^*: V \to U, and require

    \langle u ; K^*[v] \rangle = \langle f ; v \rangle    for all    v \in V.                (10.171)

The latter is the weak formulation of our original equation.


    In the finite-dimensional situation, when K is merely multiplication by some matrix,
this approach has not really accomplished anything except make the solution to the original
system more complicated. However, in the infinite-dimensional situation, when K is a
differential operator, then the original boundary value problem K[u] = f requires that u
be sufficiently differentiable, whereas the weak formulation

    \langle u ; K^*[\varphi] \rangle = \langle f ; \varphi \rangle    for all    \varphi

requires only that the test function \varphi(x) be smooth. This approach has the potential
of allowing much rougher, perhaps even discontinuous, weak solutions u(x) to the original
problem.

The method also extends to nonlinear equations.


    Example 10.18.  Consider the boundary value problem for a bar (10.12). The weak
formulation is obtained by integration by parts. We initially restrict to test functions which
vanish at the boundary: \varphi(0) = \varphi(\ell) = 0. This requirement will eliminate any boundary
terms in the integration by parts computation

    \langle K[u] ; \varphi \rangle
      = \int_0^\ell \left[ -\frac{d}{dx}\left( c(x)\, \frac{du}{dx} \right) \right] \varphi(x)\, dx
      = \int_0^\ell c(x)\, \frac{du}{dx}\, \frac{d\varphi}{dx}\, dx
      = \int_0^\ell f(x)\, \varphi(x)\, dx = \langle f ; \varphi \rangle.                    (10.172)

    This semi-weak formulation of the equation is known in mechanics as the principle
of virtual work. A second integration by parts, and restricting to test functions whose
derivatives also vanish at the boundary, \varphi'(0) = \varphi'(\ell) = 0, produces the weak formulation
(10.171):

    \langle u ; K[\varphi] \rangle
      = \int_0^\ell u(x) \left[ -\frac{d}{dx}\left( c(x)\, \frac{d\varphi}{dx} \right) \right] dx
      = \int_0^\ell f(x)\, \varphi(x)\, dx = \langle f ; \varphi \rangle.                    (10.173)
Now, even discontinuous functions u(x) are allowed as weak solutions. The goal is to
find u(x) such that this condition holds for all smooth test functions \varphi(x). For example, any function u(x) which satisfies the differential equation (10.12) except at points of
discontinuity qualifies as a weak solution.
    In a finite element or Galerkin approximation to the weak solution, one restricts
attention to a finite-dimensional subspace W spanned by functions \varphi_1, \dots, \varphi_n, and requires
that the approximate solution

    w(x) = c_1 \varphi_1(x) + \cdots + c_n \varphi_n(x)                                      (10.174)

satisfy the orthogonality condition only for elements \varphi \in W of the subspace. As usual, this
only needs to be checked on the basis elements. Substituting (10.174) into the semi-weak
form of the system, (10.172), produces a linear system of equations of the form

    \langle w ; K[\varphi_i] \rangle = \sum_{j=1}^{n} m_{ij}\, c_j = b_i = \langle f ; \varphi_i \rangle,
    \qquad i = 1, \dots, n.                                                                  (10.175)

The reader will recognize this as exactly the same finite element linear system (10.152)
derived through the minimization approach; the weak formulation recovers it directly.
Therefore, for a self-adjoint boundary value problem, the weak formulation and the
minimization principle, when restricted to the finite-dimensional subspace W, lead to
exactly the same equations for the finite element approximation to the solution.
In non-self-adjoint situations, the weak formulation is still applicable even though
there is no underlying minimization principle. On the other hand, unlike the positive
definite case, there is no guarantee that either the original boundary value problem or
its finite element approximation have a solution. Indeed, it is entirely possible that the
boundary value problem has a solution, but the finite element matrix system does not.

Even more worrying would be cases in which the finite element system has a solution,
but there is, in fact, no actual solution to the boundary value problem! Nevertheless, in
many situations, the weak solution approach leads to a perfectly acceptable finite element
approximation to the true solution to the system. Details can be found in [weak].


Chapter 11
Fourier Series
    Fourier's spectacular discovery that almost any periodic function can be decomposed
into pure trigonometric functions was a milestone in applied (and pure) mathematics.
Fourier's theory has the remarkable consequence that any periodic signal, no matter
how wild the wave form, can be decomposed into a sum of pure periodic sine and
cosine waves. For example, a musical instrument makes a certain sound; decomposing
it into a Fourier series shows which fundamental frequencies (tones, overtones, etc.) are
summed together to produce the particular sound of the instrument, be it piano, violin,
trumpet, oboe, or drum. The Fourier decomposition lies at the heart of modern electronic
music; a synthesizer combines the fundamental pure sine and cosine tones to reproduce the
different sounds of instruments, both known and unknown, according to Fourier's general
prescription.
The key to the efficacy of Fourier series rests on the orthogonality properties of the
trigonometric functions. This is a direct consequence of their status as eigenfunctions
of the simplest self-adjoint boundary value problem. In this manner, Fourier series can
also be viewed as the simplest function space version of the finite-dimensional spectral
theory of symmetric matrices and orthogonal eigenvector bases. The key complication is
that we must deal with infinite series rather than finite sums, and so convergence issues
that do not appear in the finite-dimensional situation become of paramount importance.
The Fourier trigonometric series is the simplest of a broad class of infinite series based
on the eigenfunctions of self-adjoint boundary value problems. Other important examples
arising in physical applications, including Bessel and Legendre functions, will appear in
Chapters 16 and 17.
In this chapter, we concentrate on classical Fourier series. A variety of extensions
of Fourier analysis, including the Fourier transform, the Laplace transform, the discrete
Fourier transform, wavelets, will be presented in the ensuing chapter. The range of successful applications of Fourier series, Fourier integrals, and their relatives is truly remarkable,
encompassing the solution to partial differential equations, vibrations and waves, signal
processing, data compression, digital image processing, and many, many other fields, including many areas of pure mathematics such as group theory and number theory.
We begin with a section designed to motivate how Fourier series are a very natural
outgrowth of the eigenvalue/eigenvector methods used to solve systems of linear ordinary differential equations that govern the dynamical behavior of discrete mechanical and
electrical systems. The corresponding systems that model the dynamical behavior of continuous mechanical systems such as bars are partial differential equations. The solutions
are written in terms of the associated eigenfunctions which, in the very simplest situation,

are the trigonometric functions that form the foundation of Fourier methods. The reader
uninterested in motivation can safely omit this section as the same material reappears in
the subsequent chapters devoted to complete analysis of the dynamical partial differential equations that lead to Fourier methods. Beginning in Section 11.2, we shall review,
omitting proofs, the basic methods of Fourier series in the following sections, including
applications to both ordinary and generalized functions such as the delta function, and
explain why they are of such utility in applications. The final, optional section gives some
of the analytical background required to form a rigorous foundation for Fourier series,
although space considerations necessitate that it fall short of a complete introduction to
the theory, which can be found in more specialized texts.

11.1. Dynamical Equations of Continuous Media.


The purpose of this section is to discover why Fourier series arise naturally when we
move from discrete systems of ordinary differential equations to the partial differential
equations that govern the dynamics of continuous mechanical systems. Thus, replacing
a system of discrete masses and springs will be a continuum, e.g., a one-dimensional bar
or string, a two-dimensional plate or a three-dimensional solid body. Of course, physical
bodies are composed of atoms, and hence could in principle be modeled by discrete mechanical systems. However, the number of atoms is so large that any direct attempt to solve
the resulting system of ordinary differential equations would be completely impractical.
Thus, regarding physical bodies as continuous media not only provides extremely accurate
physical models, but turns out to be absolutely crucial for making significant progress on
the mathematical and computational analysis of such systems. Interestingly, the numerical solution of such partial differential equations returns us to the discrete realm. While
one might envision going directly from the discrete atomic system to the typically much
smaller discrete numerical approximation, this is not such a simple matter. The analytical
power and insight offered by calculus in the continuous regime makes this intermediate
step essential to effective modeling of physical phenomena.
The two principal classes that will be treated are the first order systems (8.64) governing gradient flows, and the second order Newtonian vibration systems (8.77). The
former will naturally lead to diffusion equations, including the heat equation that models
the propagation of heat in a homogeneous body. The latter will lead to general vibration
equations for bars, strings, and, in higher dimensions, plates and solid bodies.
In Chapter 6 we characterized the equilibrium equations of discrete mechanical and
electrical systems as a linear algebraic system
Ku = f
with symmetric, positive (semi-)definite coefficient matrix K. There are two fundamental
types of dynamical systems associated with such equilibrium equations. Free vibrations are
governed by Newton's Law, which leads to a second order system of ordinary differential
equations of the form

    \frac{d^2 u}{dt^2} = -K u.                                                               (11.1)

    The gradient flow equations

    \frac{du}{dt} = -K u,                                                                    (11.2)

are designed to decrease the quadratic energy function q(u) = \tfrac{1}{2}\, u^T K u as rapidly as
possible. In each case, the solution to the system was made by imposing a particular ansatz,
either trigonometric or exponential, which reduces us to the analysis of the eigenvalue
equation

    K v = \lambda v

for the matrix K. Each eigenvalue and eigenvector created a particular solution or natural
mode of the dynamical system. In the positive definite gradient flow case, the natural
modes are the exponential solutions e^{-\lambda t} v; in the case of stable vibrations, they are the
trigonometric solutions \cos \omega t\, v, \sin \omega t\, v with \omega^2 = \lambda. In either case, the general solution
could be expressed as a superposition of these fundamental modes.
    As we discovered in Chapter 10, the same abstract formalism applies to the equilibrium
equations of one-dimensional media: bars, beams, etc. The positive (semi-)definite
matrix is replaced by a positive (semi-)definite linear operator. Formally, the equilibrium
system takes the same form

    K[u] = f,                                                                                (11.3)
with the proper boundary conditions being included so as to ensure self-adjointness of
the differential operator K. One therefore expects the corresponding dynamical processes
of continuous media to be modeled by the same general formalism, and this intuition is
correct.
    Consider first a gradient flow system of the form

    \frac{\partial u}{\partial t} = -K[u].                                                   (11.4)

Such differential equations model diffusion processes in which a quadratic energy functional
is decreasing as rapidly as possible. A good physical example is the flow of heat in a body;
the heat disperses throughout the body so as to decrease the thermal energy as quickly
as it can, tending (in the absence of external heat sources) to thermal equilibrium. Other
physical processes modeled by such systems include diffusion of solvents, pollutants, etc.,
and of populations (of animals, bacteria, etc.) in a medium.
The simplest and most consequential example is the case of a uniform periodic (or
circular) bar. As we saw in Chapter 10, the equilibrium equation (11.3) takes the form
    -u'' = f,    u(-\pi) = u(\pi),    u'(-\pi) = u'(\pi),                                    (11.5)

where, for later convenience, we take the interval -\pi \le x \le \pi of length 2\pi. The self-adjoint
operator is the second order derivative

    K = D^* \circ D = (-D) \circ D = -D^2                                                    (11.6)

acting on the space of 2\pi periodic functions. The corresponding diffusion equation (11.4)
is the partial differential equation

    \frac{\partial u}{\partial t} = \frac{\partial^2 u}{\partial x^2},
    \qquad
    u(t, -\pi) = u(t, \pi),
    \qquad
    \frac{\partial u}{\partial x}(t, -\pi) = \frac{\partial u}{\partial x}(t, \pi),          (11.7)

known as the heat equation since it models (among other diffusion processes) heat flow.
The function u(t, x) represents the temperature at position x and time t. Heat naturally
flows from hot to cold, and so the fact that it can be described by a gradient flow should
not be surprising; a derivation of (11.7) from physical principles will appear in Chapter 13.
Solving the periodic heat equation was the seminal problem that led Fourier to develop
the profound theory that now bears his name.
As in the discrete version, the natural mode solutions to a diffusion equation (11.4)
are found by using an exponential ansatz:
    u(t, x) = e^{-\lambda t}\, v(x),                                                         (11.8)

in which we replace the eigenvector v by a continuous function v(x). Such solutions are
known as separable, which indicates that they are given as the product of a function of t
alone and a function of x alone. Substituting (11.8) into the dynamical equations (11.4),
since the exponential factor only depends on t, it is unaffected by the differential operator
K, which only involves differentiation with respect to x. The net result is that v(x) is
required to solve an eigenvalue problem of the form
    K[v] = \lambda\, v,                                                                      (11.9)

in which \lambda is the eigenvalue and v(x) is the corresponding eigenfunction for the operator
K. The eigenfunction is always required to satisfy the relevant boundary conditions. Each
eigenvalue and eigenfunction pair will produce a solution (11.8) to the partial differential
equation, and the general solution can be built up through superposition.
Substitution of the exponential ansatz (11.8) into the periodic heat equation (11.7)
leads to the eigenvalue problem
    v'' + \lambda\, v = 0,    v(-\pi) = v(\pi),    v'(-\pi) = v'(\pi).                        (11.10)

This constitutes a 2\pi periodic boundary value problem for the eigenfunction v(x). The
operator K = -D^2 is positive semi-definite on the subspace of 2\pi periodic functions.
As with matrices, this immediately implies that its eigenvalues must be real and non-negative: \lambda \ge 0. (See Exercise .) Indeed, as the reader can verify, if \lambda < 0 or \lambda is
complex, then the only periodic solution to (11.10) is the trivial solution v(x) \equiv 0. When
\lambda = 0, the periodic solutions to (11.10) are the constant functions, and so any nonzero
constant function v(x) \equiv c is an eigenfunction for the \lambda = 0 eigenvalue. For the positive

Since u(t, x) now depends upon time as well as position, we switch from ordinary to partial
derivative notation.


eigenvalues, if we write \lambda = \omega^2 with \omega > 0, then the general solution to the differential
equation (11.10) is a linear combination

    v(x) = a \cos \omega x + b \sin \omega x.

A nonzero function of this form will satisfy the 2\pi periodic boundary conditions if and
only if \omega = k is an integer. Therefore, the eigenvalues

    \lambda = k^2,    0 \le k \in \mathbb{N},

are the squares of non-negative integers. Each positive eigenvalue \lambda = k^2 > 0 admits two
linearly independent eigenfunctions, namely \sin kx and \cos kx, while the zero eigenvalue
\lambda = 0 has only one independent eigenfunction, the constant function 1. We conclude that
the standard trigonometric functions
    1,  \cos x,  \sin x,  \cos 2x,  \sin 2x,  \cos 3x,  \dots                                (11.11)

form a complete system of independent eigenfunctions for the periodic boundary value
problem (11.10). Moreover, owing to their status as eigenfunctions of a self-adjoint boundary value problem, they are orthogonal with respect to the standard L^2 inner product
on the interval [-\pi, \pi], a fact previously noted in Example 5.13. As we shall discover,
orthogonality is the key to unraveling the mysteries of Fourier series!
Each eigenfunction gives rise to a particular solution to the periodic heat equation
(11.7). We have therefore constructed an infinite collection of independent solutions:

u_k(t, x) = e^{−k² t} cos k x,    ũ_k(t, x) = e^{−k² t} sin k x,    k = 0, 1, 2, 3, ... .

According to our linear superposition principle, any finite linear combination

u(t, x) = a₀ + Σ_{k=1}^{n} [ a_k e^{−k² t} cos k x + b_k e^{−k² t} sin k x ]    (11.12)

of these particular solutions is also a solution. However, finite linear combinations will
not suffice to describe the general solution to the problem, and we must replace the finite
sum (11.12) by an infinite series. This immediately raises deep and interesting analytical
questions. When does such an infinite series converge? Can we represent a given function
f (x) as such an infinite series, and if so, how? For the trigonometric eigenfunctions, these
are the fundamental questions of Fourier analysis. After we have firmly established the
basics of Fourier theory, we shall then return to these questions for both the heat and wave
equations in Chapter 13.
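The truncated series (11.12) is straightforward to evaluate numerically. The following Python sketch is illustrative only and is not part of the original text; the particular choice of coefficients is an assumption made purely for demonstration. It shows how the factors e^{−k² t} damp the higher modes as t grows.

```python
import numpy as np

def heat_partial_sum(x, t, a, b):
    """Evaluate the truncated series (11.12):
    u(t,x) = a[0] + sum_k ( a[k] e^{-k^2 t} cos kx + b[k] e^{-k^2 t} sin kx )."""
    u = np.full_like(x, a[0], dtype=float)
    for k in range(1, len(a)):
        decay = np.exp(-k**2 * t)          # the k-th mode decays like e^{-k^2 t}
        u += decay * (a[k] * np.cos(k * x) + b[k] * np.sin(k * x))
    return u

# hypothetical coefficients standing in for some initial temperature profile
a = [0.5, 1.0, 0.0, 0.3]     # a_0, a_1, a_2, a_3
b = [0.0, 0.0, 0.7, 0.0]     # b_1, b_2, b_3 (index 0 unused)

x = np.linspace(-np.pi, np.pi, 5)
for t in (0.0, 0.1, 1.0):
    print(t, np.round(heat_partial_sum(x, t, a, b), 4))
# the k = 2, 3 contributions shrink like e^{-4t}, e^{-9t}, so the profile
# rapidly flattens toward the constant mean value a_0
```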
A similar analysis applies to a second order system of the Newtonian form
∂²u/∂t² = −K[ u ].    (11.13)

Such differential equations are used to describe the free vibrations of continuous mechanical systems, such as bars, strings, and, in higher dimensions, membranes, solid bodies,

fluids, etc. For example, the vibration system (11.13) corresponding to the differential
operator (11.6) is the wave equation
∂²u/∂t² = ∂²u/∂x².    (11.14)

The wave equation models stretching vibrations of a bar, sound vibrations in a column of
air, e.g., inside a wind instrument, transverse vibrations of a string, e.g., a violin string,
surface waves on a fluid, electromagnetic waves, and a wide variety of other vibrational
and wave phenomena.
As always, we need to impose suitable boundary conditions in order to proceed. Consider, for example, the wave equation with homogeneous Dirichlet boundary conditions
∂²u/∂t² = ∂²u/∂x²,    u(t, 0) = 0,    u(t, ℓ) = 0,    (11.15)

that models, for instance, the vibrations of a uniform violin string whose ends are tied
down. Adapting our discrete trigonometric ansatz, we are naturally led to look for a
separable solution of the form
u(t, x) = cos(ω t) v(x)    (11.16)

in which ω represents the vibrational frequency. Substituting into the wave equation
and the associated boundary conditions, we deduce that v(x) must be a solution to the
eigenvalue problem

d²v/dx² + ω² v = 0,    v(0) = 0 = v(ℓ),    (11.17)

in which ω² = λ plays the role of the eigenvalue. For ω² > 0, which results from positive
definiteness of the underlying system, the general solution to the differential equation is
a trigonometric function

v(x) = a cos ω x + b sin ω x.

The boundary condition at x = 0 requires a = 0, and so

v(x) = b sin ω x.

The second boundary condition requires

v(ℓ) = b sin ω ℓ = 0.

Assuming b ≠ 0, as otherwise the solution is trivial, ω ℓ must be an integer multiple of π.
Thus, the natural frequencies of vibration are

ω_k = k π / ℓ,    k = 1, 2, 3, ... .

The corresponding eigenfunctions are

v_k(x) = sin(k π x / ℓ),    k = 1, 2, 3, ... .    (11.18)

Thus, we find the following natural modes of vibration of the wave equation:
u_k(t, x) = cos(k π t / ℓ) sin(k π x / ℓ),    ũ_k(t, x) = sin(k π t / ℓ) sin(k π x / ℓ).

Each solution represents a spatially periodic standing wave form. We expect to write the
general solution to the boundary value problem as an infinite series

u(t, x) = Σ_{k=1}^{∞} [ b_k cos(k π t / ℓ) sin(k π x / ℓ) + d_k sin(k π t / ℓ) sin(k π x / ℓ) ]    (11.19)

in the natural modes. Interestingly, in this case at each fixed t, there are no cosine terms,
and so we have a more specialized type of Fourier series. The same convergence issues for
such Fourier sine series arise. It turns out that the general theory of Fourier series will
also cover Fourier sine series.
We have now completed our brief introduction to the dynamical equations of continuous media and the Fourier series method of solution. The student should now be sufficiently
motivated, and it is time to delve into the theory of basic Fourier series. We will not try to
deal with more general eigenfunction expansions until Chapter 16, but instead concentrate
on the simplest and most important special case, when the eigenfunctions are trigonometric functions, and the series is a classical Fourier series. In Chapter 13 we will return to
the applications to the one-dimensional heat and wave equations.

11.2. Fourier Series.


The preceding section served as a launching pad to motivate Fourier analysis. While
the applications to partial differential equations provided our (and Fouriers) route into the
fundamental definition, the remarkable range of applications qualifies Fouriers discovery
as one of the most important innovations in mathematical history. We commence with the
key definition.
Definition 11.1. A Fourier series is an infinite trigonometric series

f(x) ∼ a₀/2 + Σ_{k=1}^{∞} [ a_k cos k x + b_k sin k x ].    (11.20)

The extra factor of ½ is traditionally included in the first term for later convenience.

Of course, without additional assumptions on the coefficients a_k, b_k, the Fourier series
(11.20) may not converge. This is the reason that we use the symbol ∼ instead of an equals
sign.
Remark: We have chosen the interval [−π, π] for convenience. A common alternative
choice is the interval [0, 2π]. In fact, since the trigonometric functions are 2π periodic,
any interval of length 2π will do equally well. Adapting Fourier series to intervals of other
lengths will be discussed in Section 11.4.

The key questions are


(a) First, when does such an infinite series converge?
(b) Second, what kinds of functions f (x) can be represented as an infinite trigonometric
series?
(c) Thirdly, if we have such an f, how do we determine the required coefficients a_k, b_k?
(d) And lastly, since we are trying to solve differential equations, we need to know when
we may differentiate such a series.
The first order of business is to determine the formulae for the Fourier coefficients
ak , bk of a given function; only then will we discuss convergence. The key is orthogonality.
We already observed, in Example 5.13, that the trigonometric functions (11.11) form an
orthogonal system of functions with respect to the (rescaled) L2 inner product
⟨ f ; g ⟩ = (1/π) ∫_{−π}^{π} f(x) g(x) dx.    (11.21)

Indeed, this fact is a direct consequence of their status as eigenfunctions of a self-adjoint
boundary value problem (11.10). The explicit orthogonality relations are

⟨ cos k x ; cos l x ⟩ = ⟨ sin k x ; sin l x ⟩ = 0,  k ≠ l,        ⟨ cos k x ; sin l x ⟩ = 0,
‖ cos k x ‖ = ‖ sin k x ‖ = 1,  k ≠ 0,        ‖ 1 ‖ = √2,    (11.22)

whenever k and l are non-negative integers. Actually, the trigonometric functions (11.11)
would be orthonormal if we were to replace the constant function 1 by 1/√2; however, this
√2 factor turns out to be utterly annoying, and is best omitted.


If we ignore convergence issues for the moment and treat the Fourier series (11.20) as
a finite sum, then the orthogonality relations (11.22) serve to immediately determine the
Fourier coefficients. Taking the inner product of both sides with, respectively, cos k x and
sin k x, we find
a_k = ⟨ f ; cos k x ⟩ = (1/π) ∫_{−π}^{π} f(x) cos k x dx,    k = 0, 1, 2, 3, ...,
b_k = ⟨ f ; sin k x ⟩ = (1/π) ∫_{−π}^{π} f(x) sin k x dx,    k = 1, 2, 3, ... .    (11.23)

These fundamental formulae prescribe the Fourier coefficients of the function f. The fact
that we can also use them as written for a₀ is the reason for including the ½ in the constant
term of the Fourier series (11.20).
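As a quick numerical check of (11.23), one can approximate the integrals by quadrature. The sketch below is illustrative only (it is not part of the text); the helper name fourier_coefficients is an assumption.

```python
import numpy as np
from scipy.integrate import quad

def fourier_coefficients(f, n):
    """Approximate a_k, b_k from (11.23) for k = 0,...,n by numerical quadrature."""
    a = [quad(lambda x: f(x) * np.cos(k * x), -np.pi, np.pi)[0] / np.pi
         for k in range(n + 1)]
    b = [0.0] + [quad(lambda x: f(x) * np.sin(k * x), -np.pi, np.pi)[0] / np.pi
                 for k in range(1, n + 1)]
    return a, b

# sanity check on f(x) = x, whose coefficients are worked out by hand just below
a, b = fourier_coefficients(lambda x: x, 5)
print(np.round(a, 10))   # all (numerically) zero
print(np.round(b, 10))   # 2, -1, 2/3, -1/2, 2/5, i.e. 2(-1)^{k+1}/k
```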
Example 11.2. Consider the function f (x) = x. We may compute its Fourier
coefficients directly, using integration by parts to evaluate the integrals:

a₀ = (1/π) ∫_{−π}^{π} x dx = 0,
a_k = (1/π) ∫_{−π}^{π} x cos k x dx = (1/π) [ x sin k x / k + cos k x / k² ]_{x = −π}^{π} = 0,
b_k = (1/π) ∫_{−π}^{π} x sin k x dx = (1/π) [ − x cos k x / k + sin k x / k² ]_{x = −π}^{π} = (2/k) (−1)^{k+1}.    (11.24)

Therefore, the Fourier cosine coefficients of the function x all vanish, a_k = 0, and its
Fourier series is

x ∼ 2 [ sin x − sin 2x/2 + sin 3x/3 − sin 4x/4 + ··· ].    (11.25)
The convergence of this series is not an elementary matter. Standard tests, including
the ratio and root tests, that almost always work for power series, fail to apply. Secondly,
even if we know that the series converges (which it does, for all x) it is certainly not obvious
what function it converges to. Indeed, it cannot converge to the function f (x) = x for all
values of x! If we substitute x = π, then every term in the series is zero, and so the Fourier
series converges to 0, which is not the same as f(π) = π.
The n-th partial sum of a Fourier series is the trigonometric polynomial

s_n(x) = a₀/2 + Σ_{k=1}^{n} [ a_k cos k x + b_k sin k x ].    (11.26)

The Fourier series converges at a point x if and only if the partial sums have a limit

lim_{n→∞} s_n(x) = f̃(x),    (11.27)

which may or may not equal the value of the original function f(x). Thus, a key requirement is to formulate easily verifiable conditions on the function f(x) that guarantee
that the Fourier series converges, and, even more importantly, that the limiting sum equals the
original function: f̃(x) = f(x). This will be done in detail below.
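A minimal numerical sketch of the partial sums (11.26) for the series (11.25) follows; it is illustrative only and not part of the text. The values s_n(x) creep toward x at interior points, while every partial sum vanishes at x = ±π.

```python
import numpy as np

def partial_sum_x(x, n):
    """n-th partial sum (11.26) of the Fourier series (11.25) for f(x) = x."""
    k = np.arange(1, n + 1)
    return 2.0 * np.sum((-1.0) ** (k + 1) * np.sin(np.outer(x, k)) / k, axis=1)

x = np.array([0.5, 1.0, 2.0, np.pi])
for n in (5, 50, 500):
    print(n, np.round(partial_sum_x(x, n), 4))
# at x = 0.5, 1.0, 2.0 the sums slowly approach those values;
# at x = pi every term is zero, so the limit is 0, not f(pi) = pi
```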
The passage from trigonometric polynomials to Fourier series is analogous to the
passage from polynomials to power series. A power series
f(x) ∼ c₀ + c₁ x + ··· + c_n xⁿ + ··· = Σ_{k=0}^{∞} c_k x^k

can be viewed as an infinite linear combination of the basic monomials 1, x, x², x³, ... .
According to Taylor's formula, (C.8), the coefficients are given in terms of the derivatives
of the function at the origin, c_k = f^{(k)}(0) / k!, not by an inner product. The partial sums

s_n(x) = c₀ + c₁ x + ··· + c_n xⁿ = Σ_{k=0}^{n} c_k x^k

of a power series are ordinary polynomials, and similar convergence questions arise.
However, although superficially similar, in actuality the two theories are profoundly
different. A power series either converges everywhere, or on an interval centered at 0,
or nowhere except at 0. (See Section 15.2 for details.) (The reason for the term
trigonometric polynomial was discussed at length in Example 2.17(c).) A Fourier series can converge on
quite bizarre sets. In fact, a detailed analysis of the convergence of Fourier series led
Georg Cantor to establish the foundations of modern set theory, and, thus, had a seminal
impact on the very foundations of mathematics and logic. Secondly, when a power series
converges, it converges to an analytic function, which is infinitely differentiable, and whose
derivatives are also represented by power series, obtained by termwise differentiation. A
Fourier series can converge, not only to an arbitrary periodic continuous function, but even
to a wide variety of discontinuous functions and, when suitably interpreted, to generalized
functions like the delta function! Therefore, the termwise differentiation of a Fourier series
is a nontrivial issue. Indeed, while the theory of power series was well established in the
early days of the calculus, there remain, to this day, unresolved foundational issues in
Fourier theory.
Once one understands how different the two subjects are, one begins to understand
why Fourier's astonishing claims were widely disbelieved. Before the advent of Fourier,
most mathematicians only viewed analytic functions as genuine. The fact that Fourier series can converge to nonanalytic, even discontinuous functions was extremely disconcerting,
and led to a complete re-evaluation of function theory, culminating in the definition of a
function that you now learn in first year calculus. Only through the combined efforts of
many of the leading mathematicians of the nineteenth century was a rigorous theory of
Fourier series firmly established.
Periodic Extensions
The trigonometric constituents (11.11) of a Fourier series are all periodic functions
of period 2π. Therefore, if the series converges, the resulting function f̃(x) must also be
periodic of period 2π:

f̃(x + 2π) = f̃(x)    for all    x ∈ ℝ.

A Fourier series can only converge to a 2π periodic function. Therefore, we should not
expect the Fourier series for f(x) = x to converge to x everywhere, since x is not periodic.
Rather, it converges to its 2π periodic extension, as we now define.
Lemma 11.3. If f(x) is any function defined for −π < x ≤ π, then there is a unique
2π periodic function f̃, known as the 2π periodic extension of f, that satisfies f̃(x) = f(x)
for all −π < x ≤ π.

Proof: Given x ∈ ℝ, there is a unique integer m so that −π < x − 2mπ ≤ π.
Periodicity of f̃ leads us to define

f̃(x) = f̃(x − 2mπ) = f(x − 2mπ).

In particular, if −π < x ≤ π, then m = 0 and hence f̃(x) = f(x). The proof that the
resulting function f̃ is 2π periodic is an easy exercise.    Q.E.D.
Pictorially, the graph of the periodic extension of a function f(x) is obtained by
repeatedly copying that part of the graph of f between −π and π to all other adjacent
intervals of length 2π.
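The construction in the proof of Lemma 11.3 is simply a reduction of x modulo 2π into the interval (−π, π]. A short sketch (illustrative only; the function name is hypothetical):

```python
import numpy as np

def periodic_extension(f, x):
    """Evaluate the 2*pi periodic extension of f (defined on (-pi, pi]) at the points x,
    following Lemma 11.3: shift x by a multiple of 2*pi into (-pi, pi]."""
    y = np.mod(np.asarray(x, dtype=float) + np.pi, 2 * np.pi) - np.pi  # lands in [-pi, pi)
    y = np.where(y == -np.pi, np.pi, y)                                # convention: use (-pi, pi]
    return f(y)

# sawtooth: the periodic extension of f(x) = x (compare Figure 11.1)
print(periodic_extension(lambda x: x, [0.5, np.pi + 0.5, -np.pi - 0.5, 7.0]))
# each argument is first shifted into (-pi, pi], so e.g. 7.0 gives 7.0 - 2*pi
```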

Figure 11.1. Periodic extension of x.

Remark: The construction of the periodic extension of Lemma 11.3 uses the value
f(π) at the right endpoint and requires f̃(−π) = f̃(π) = f(π). One could, alternatively,
require f̃(π) = f̃(−π) = f(−π), which, if f(−π) ≠ f(π), leads to a slightly different
2π periodic extension of the function. The two extensions only differ when x is an odd
multiple of π. There is no a priori reason to prefer one over the other. In fact, for Fourier
theory, as we shall discover, one should use neither, but rather an average of the two.
Thus, the preferred Fourier periodic extension f̃(x) will satisfy

f̃(π) = f̃(−π) = ½ [ f(π) + f(−π) ],    (11.28)

which then fixes its values at the odd multiples of π.

Example 11.4. The 2π periodic extension f̃(x) of f(x) = x is the sawtooth function graphed in Figure 11.1. It agrees with x between −π and π. If we adopt the Fourier
extension (11.28), then we set f̃(kπ) = 0 for any odd integer k. With this convention, it
can be proved that the Fourier series (11.25) for f(x) = x converges everywhere to the 2π
periodic extension f̃(x). In particular,

2 Σ_{k=1}^{∞} (−1)^{k+1} sin k x / k = { x,  −π < x < π;    0,  x = ±π. }    (11.29)

Even this very simple example has remarkable and nontrivial consequences. For instance, if we substitute x = ½π in (11.25) and divide by 2, we obtain Gregory's series

π/4 = 1 − 1/3 + 1/5 − 1/7 + 1/9 − ··· .    (11.30)
While this remarkable formula predates Fourier theory (it was first discovered by Leibniz),
it is quite difficult to establish its validity directly.

Remark: While fascinating from a numerological viewpoint, Gregory's series is of
scant practical use for actually computing π, since it converges extremely slowly. The
reader may wish to try adding up terms to see how far out one needs to go to accurately
compute even the first two decimal digits of π. Round-off errors will eventually interfere
with any attempt to accurately compute the complete summation to any reasonable degree
of accuracy.
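To see just how slowly Gregory's series converges, one can simply add up terms; the throwaway sketch below is not from the text and is included only as an illustration.

```python
import math

def gregory_partial(n):
    """Partial sum of Gregory's series (11.30), multiplied by 4 to approximate pi."""
    return 4.0 * sum((-1.0) ** k / (2 * k + 1) for k in range(n))

for n in (10, 100, 1000, 10000):
    approx = gregory_partial(n)
    print(n, approx, abs(approx - math.pi))
# the error decays only like 1/n, so several hundred terms are needed before the
# first two decimal digits of pi settle down
```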

Figure 11.2. Piecewise Continuous Function.

Piecewise Continuous Functions


As we shall see, all continuous, 2π periodic functions can be represented as convergent
Fourier series. More generally, we can allow the function to have some simple discontinuities. Although not the most general class of functions that have convergent Fourier series,
such piecewise continuous functions will suffice for all the applications we consider in
this book.
Definition 11.5. A function f(x) is called piecewise continuous on an interval [a, b]
if it is defined and continuous except possibly at a finite number of points a ≤ x₁ < x₂ <
... < xₙ ≤ b. At each point of discontinuity, the left and right hand limits

f(x_k⁺) = lim_{x → x_k⁺} f(x),    f(x_k⁻) = lim_{x → x_k⁻} f(x),

exist. Note that we do not require that f(x) be defined at x_k. Even if f(x_k) is defined, it
does not necessarily equal either the left or the right hand limit.
A function f(x) defined for all x ∈ ℝ is piecewise continuous provided it is piecewise continuous on every bounded interval. In particular, a 2π periodic function f̃(x) is
piecewise continuous if and only if it is piecewise continuous on the interval [−π, π].
A representative graph of a piecewise continuous function appears in Figure 11.2. The
points x_k are known as jump discontinuities of f(x) and the difference

β_k = f(x_k⁺) − f(x_k⁻) = lim_{x → x_k⁺} f(x) − lim_{x → x_k⁻} f(x)    (11.31)

between the left and right hand limits is the magnitude of the jump, cf. (10.43). If β_k = 0,
and so the right and left hand limits agree, then the discontinuity is removable since
redefining f(x_k) = f(x_k⁺) = f(x_k⁻) makes f continuous at x_k. Thus, we may assume,
without loss of generality, that our functions have no removable discontinuities.

At the endpoints a, b we only require one of the limits, namely f(a⁺) and f(b⁻), to exist.


The simplest example of a piecewise continuous function is the step function

σ(x) = { 1,  x > 0;    0,  x < 0. }    (11.32)

The step function has a single jump discontinuity at x = 0 of magnitude 1, and is continuous, indeed constant, everywhere else. If we translate and scale the step function,
we obtain a function

h(x) = β σ(x − y) = { β,  x > y;    0,  x < y, }    (11.33)

with a single jump discontinuity of magnitude β at the point x = y.
As we saw in Section 10.2, every function with a single jump discontinuity can be
written as a sum

f(x) = g(x) + β σ(x − y),    (11.34)

of a continuous function g(x) and a multiple of the step function (11.33), where β = f(y⁺) − f(y⁻) is the magnitude of the jump in f at y. The equation (11.34) holds everywhere except
possibly at the jump x = y. In fact, except at the discontinuity points, every piecewise
continuous function can be written as a sum

f(x) = g(x) + Σ_{k=1}^{n} β_k σ(x − x_k),    x ≠ x₁, ..., xₙ,

of a continuous function g(x) and a finite linear combination of step functions at the
discontinuities, with the coefficients β_k = f(x_k⁺) − f(x_k⁻) representing the magnitudes of
the jumps. Examples can be found in Section 10.2.
If f(x) is any piecewise continuous function, then its Fourier coefficients are well-defined; the integrals (11.23) exist and are finite. Continuity, however, is not enough to
ensure convergence of the resulting Fourier series.
Definition 11.6. A function f(x) is called piecewise C¹ on an interval [a, b] if it is
defined, continuous and continuously differentiable except possibly at a finite number of
points a ≤ x₁ < x₂ < ... < xₙ ≤ b. At each exceptional point, the left and right hand
limits

f(x_k⁺) = lim_{x → x_k⁺} f(x),    f(x_k⁻) = lim_{x → x_k⁻} f(x),
f′(x_k⁺) = lim_{x → x_k⁺} f′(x),    f′(x_k⁻) = lim_{x → x_k⁻} f′(x),

exist. See Figure 11.3 for a representative graph. For a piecewise C¹ function,
an exceptional point x_k is either
(a) a jump discontinuity of f, but where the left and right hand derivatives exist, or
(b) a corner, meaning a point where f is continuous, but it has different left and right
hand derivatives.
As before, at the endpoints we only require the appropriate one-sided limits, namely f(a⁺),
f′(a⁺) and f(b⁻), f′(b⁻), to exist.


Figure 11.3. Piecewise C¹ Function.

Thus, at each point, including jump discontinuities, the graph of f (x) has well-defined
right and left tangent lines. For example, the function f(x) = | x | is piecewise C¹ since it
is continuous everywhere and has a corner at x = 0, with f′(0⁺) = +1, f′(0⁻) = −1.
There is an analogous definition of a piecewise Cn function. One requires that the
function has n continuous derivatives, except at a finite number of points. Moreover, at
every point, the function has well-defined right and left hand limits of all its derivatives
up to order n.
The Convergence Theorem
The fundamental convergence theorem for Fourier series can now be stated.
Theorem 11.7. If f̃(x) is any 2π periodic, piecewise C¹ function, then its Fourier
series converges for all x to

f̃(x)    if f̃ is continuous at x,
½ [ f̃(x⁺) + f̃(x⁻) ]    if x is a jump discontinuity.

Therefore, at discontinuities, the Fourier series splits the difference and converges
to the average of the right and left hand limits. If we define f̃ at its discontinuities to have
this value,

f̃(x) = ½ [ f̃(x⁺) + f̃(x⁻) ],    (11.35)

an equation that automatically holds at all points of continuity, then Theorem 11.7
would say that the Fourier series converges to f̃(x) everywhere. We will prove the Convergence Theorem 11.7 in Section 11.5.
Remark : There are pathological examples of continuous functions whose Fourier series
do not converge. Since such functions are quite complicated, and do not play a role in
applications, we will not pursue them here.

Figure 11.4. Periodic Step Function.

Example 11.8. Let σ(x) denote the step function (11.32). Its Fourier coefficients
are easily computed:

a₀ = (1/π) ∫_{−π}^{π} σ(x) dx = (1/π) ∫_{0}^{π} dx = 1,
a_k = (1/π) ∫_{−π}^{π} σ(x) cos k x dx = (1/π) ∫_{0}^{π} cos k x dx = 0,
b_k = (1/π) ∫_{−π}^{π} σ(x) sin k x dx = (1/π) ∫_{0}^{π} sin k x dx = { 2/(kπ),  k = 2l + 1 odd;    0,  k = 2l even. }

Therefore, the Fourier series for the step function is

σ(x) ∼ ½ + (2/π) [ sin x + sin 3x/3 + sin 5x/5 + sin 7x/7 + ··· ].    (11.36)

According to Theorem 11.7, the Fourier series will converge to the 2π periodic extension
of the step function, which is

σ̃(x) = { 0,  (2m + 1)π < x < (2m + 2)π;    1,  2mπ < x < (2m + 1)π;    ½,  x = mπ, }

where m denotes an arbitrary integer. A graph appears in Figure 11.4. In accordance with
Theorem 11.7, σ̃(x) takes the midpoint value ½ at the jump discontinuities 0, ±π, ±2π, ... .
It is instructive to analyze the convergence of this particular Fourier series in a little
detail. Figure 11.5 displays a graph of the first few partial sums, taking, respectively,
n = 3, 5, and 10 terms. The reader will notice that away from the jumps, the series
does appear to be converging, albeit slowly. However, near the jumps there is a consistent
overshoot of about 9% on both sides of the discontinuities. The region where the overshoot
occurs becomes narrower and narrower as the number of terms increases, but the magnitude
of the overshoot persists no matter how many terms are summed up. The persistence of
the overshoot was first noted by the American physicist Josiah Gibbs, and is known as

the Gibbs phenomenon in his honor. The Gibbs overshoot is a manifestation of the subtle
non-uniform convergence of the Fourier series.

Figure 11.5. Gibbs Phenomenon.
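The roughly 9% overshoot can be measured directly from the partial sums of (11.36). The following sketch is illustrative only (no plotting, and the helper name is hypothetical):

```python
import numpy as np

def step_partial_sum(x, n):
    """Partial sum of the Fourier series (11.36) for the step function."""
    k = np.arange(1, n + 1, 2)                 # only the odd harmonics contribute
    return 0.5 + (2.0 / np.pi) * np.sum(np.sin(np.outer(x, k)) / k, axis=1)

x = np.linspace(0.0005, 0.5, 5000)             # just to the right of the jump at 0
for n in (10, 100, 1000):
    overshoot = step_partial_sum(x, n).max() - 1.0
    print(n, round(float(overshoot), 4))
# the maximum overshoot stays near 0.09 (about 9% of the unit jump) no matter how
# many terms are taken; only its location moves closer to the jump
```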
Even and Odd Functions
We already noted that the Fourier cosine coefficients of the function f (x) = x were
all 0. This was not an accident, but rather a consequence of the fact that x is an odd
function. Recall first the basic definition:
Definition 11.9. A function is called even if f(−x) = f(x). A function is odd if
f(−x) = −f(x).
For example, the functions 1, cos k x and x² are all even, whereas sin k x and x are
both odd. We need two elementary lemmas, whose proofs are left to the reader.
Lemma 11.10. The sum, f (x) + g(x), of two even functions is even; the sum of two
odd functions is odd. The product f (x) g(x) of two even functions, or of two odd functions,
is an even function. The product of an even and an odd function is odd.
Remark : Every function can be represented as the sum of an even and an odd function;
see Exercise .
Lemma 11.11. If f(x) is odd and integrable on the symmetric interval [−a, a], then

∫_{−a}^{a} f(x) dx = 0.

If f(x) is even and integrable on [−a, a], then

∫_{−a}^{a} f(x) dx = 2 ∫_{0}^{a} f(x) dx.

The next result is an immediate consequence of Lemmas 11.10 and 11.11 applied to
the Fourier integrals (11.23).

Proposition 11.12. If f(x) is even, then its Fourier sine coefficients all vanish,
b_k = 0, and so f can be represented by a Fourier cosine series

f(x) ∼ a₀/2 + Σ_{k=1}^{∞} a_k cos k x,    (11.37)

where

a_k = (2/π) ∫_{0}^{π} f(x) cos k x dx,    k = 0, 1, 2, 3, ... .    (11.38)

If f(x) is odd, then its Fourier cosine coefficients vanish, a_k = 0, and so f can be represented
by a Fourier sine series

f(x) ∼ Σ_{k=1}^{∞} b_k sin k x,    (11.39)

where

b_k = (2/π) ∫_{0}^{π} f(x) sin k x dx,    k = 1, 2, 3, ... .    (11.40)

Conversely, a convergent Fourier cosine (sine) series always represents an even (odd) function.
Remark: If f̃(x) is either even or odd and 2π periodic, then it is uniquely determined
by its values on the interval [0, π]. This is proved by a straightforward adaptation of the
proof of Lemma 11.3; see Exercise .
Example 11.13. The absolute value f(x) = | x | is an even function, and hence has
a Fourier cosine series. The coefficients are computed to be

a₀ = (2/π) ∫_{0}^{π} x dx = π,
a_k = (2/π) ∫_{0}^{π} x cos k x dx = (2/π) [ x sin k x / k + cos k x / k² ]_{x=0}^{π} = { 0,  0 ≠ k even;    −4/(k² π),  k odd. }    (11.41)

Therefore

| x | ∼ π/2 − (4/π) [ cos x + cos 3x/9 + cos 5x/25 + cos 7x/49 + ··· ].    (11.42)

According to Theorem 11.7, this Fourier cosine series converges to the 2π periodic extension
of | x |, which is graphed in Figure 11.6.
In particular, if we substitute x = 0, we obtain another interesting series

π²/8 = 1 + 1/9 + 1/25 + 1/49 + ··· = Σ_{n=0}^{∞} 1/(2n + 1)².    (11.43)

Figure 11.6. Periodic extension of | x |.

This series converges faster than Gregory's series (11.30), and, while not optimal, can
be used to compute reasonable approximations to π. One can further use this result to
compute the sum of the series

S = Σ_{n=1}^{∞} 1/n² = 1 + 1/4 + 1/9 + 1/16 + 1/25 + 1/36 + 1/49 + ··· .

We note that

S/4 = Σ_{n=1}^{∞} 1/(4n²) = Σ_{n=1}^{∞} 1/(2n)² = 1/4 + 1/16 + 1/36 + 1/64 + ··· .

Therefore,

S − S/4 = (3/4) S = 1 + 1/9 + 1/25 + 1/49 + ··· = π²/8,

from which we conclude that

S = Σ_{n=1}^{∞} 1/n² = 1 + 1/4 + 1/9 + 1/16 + 1/25 + 1/36 + 1/49 + ··· = π²/6.    (11.44)
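Both sums are easy to confirm numerically; the sketch below is illustrative only and not part of the text.

```python
import math

N = 100_000
odd_sum  = sum(1.0 / (2 * n + 1) ** 2 for n in range(N))        # series (11.43)
full_sum = sum(1.0 / n ** 2 for n in range(1, N + 1))           # series (11.44)

print(odd_sum,  math.pi ** 2 / 8)   # agree to about 5 digits (tail is roughly 1/(4N))
print(full_sum, math.pi ** 2 / 6)   # agree to about 5 digits (tail is roughly 1/N)
```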

Remark: The sine and cosine functions can, in fact, be characterized as eigenfunctions
of self-adjoint boundary value problems. The functions sin k x, k = 1, 2, ..., are the
eigenfunctions for the Dirichlet eigenvalue problem

v″ + λ v = 0,    v(0) = v(π) = 0,    (11.45)

that corresponds to a bar with both ends fixed. On the other hand, the functions cos k x,
k = 0, 1, 2, ..., are the eigenfunctions for the eigenvalue problem

v″ + λ v = 0,    v′(0) = v′(π) = 0,    (11.46)

with Neumann boundary conditions, corresponding to a bar with both ends free. Thus,
we expect them to play a key role in the corresponding dynamical equations.

Complex Fourier Series


An alternative, and often more convenient, approach to Fourier series is to use complex
exponentials instead of sines and cosines. Since e^{i k x} and e^{−i k x} are linearly independent
complex eigenfunctions of the periodic boundary value problem (11.10) for the eigenvalue
λ = k², they can be used as an alternative basis for the (complex) eigenspace. Indeed,
Euler's formula

e^{i k x} = cos k x + i sin k x,    (11.47)

shows how to write the trigonometric functions

cos k x = (e^{i k x} + e^{−i k x}) / 2,    sin k x = (e^{i k x} − e^{−i k x}) / (2 i),    (11.48)

in terms of complex exponentials. Orthonormality with respect to the rescaled L² Hermitian inner product

⟨ f ; g ⟩ = (1/(2π)) ∫_{−π}^{π} f(x) g̅(x) dx,    (11.49)

was proved in Example 3.45:

⟨ e^{i k x} ; e^{i l x} ⟩ = (1/(2π)) ∫_{−π}^{π} e^{i (k − l) x} dx = { 1,  k = l;    0,  k ≠ l, }    (11.50)
‖ e^{i k x} ‖² = (1/(2π)) ∫_{−π}^{π} | e^{i k x} |² dx = 1.

The complex Fourier series for a (piecewise continuous) real or complex function f is
given by

f(x) ∼ Σ_{k=−∞}^{∞} c_k e^{i k x} = ··· + c_{−2} e^{−2 i x} + c_{−1} e^{−i x} + c₀ + c₁ e^{i x} + c₂ e^{2 i x} + ··· .    (11.51)

The complex Fourier coefficients are computed by taking Hermitian inner products

c_k = ⟨ f ; e^{i k x} ⟩ = (1/(2π)) ∫_{−π}^{π} f(x) e^{−i k x} dx.    (11.52)

Pay attention to the minus sign in the integrated exponential, the result of taking the
complex conjugate of the second argument in the inner product (11.49).

Remark: We already see one advantage of the complex version. The constant function
1 = e^{0 i x} no longer plays an anomalous role; the annoying factor ½ in the real series
(11.20) has mysteriously disappeared!
It is worth emphasizing that the real (11.20) and complex (11.51) Fourier formulae
are just two different ways of writing the same series! Indeed, if we apply Euler's formula (11.47) to (11.52) and compare with the real Fourier formulae (11.23), we find that
the real and complex Fourier coefficients are related by

a_k = c_k + c_{−k},    b_k = i (c_k − c_{−k}),
c_k = ½ (a_k − i b_k),    c_{−k} = ½ (a_k + i b_k),    k = 0, 1, 2, ... .    (11.53)
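A small numerical check of the relations (11.53), computing c_k by quadrature of (11.52) and comparing with the real coefficients (11.23), follows; the sketch and its test function are illustrative only.

```python
import numpy as np
from scipy.integrate import quad

def c_k(f, k):
    """Complex Fourier coefficient (11.52), via quadrature of real and imaginary parts."""
    re = quad(lambda x: f(x) * np.cos(k * x), -np.pi, np.pi)[0]
    im = quad(lambda x: -f(x) * np.sin(k * x), -np.pi, np.pi)[0]
    return (re + 1j * im) / (2 * np.pi)

f = lambda x: np.exp(np.sin(x))        # any convenient test function
for k in (1, 2, 3):
    ak = quad(lambda x: f(x) * np.cos(k * x), -np.pi, np.pi)[0] / np.pi
    bk = quad(lambda x: f(x) * np.sin(k * x), -np.pi, np.pi)[0] / np.pi
    print(k, np.allclose(c_k(f, k), 0.5 * (ak - 1j * bk)))   # c_k = (a_k - i b_k)/2
```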

Example 11.14. For the step function σ(x) considered in Example 11.8, the complex
Fourier coefficients are

c_k = (1/(2π)) ∫_{−π}^{π} σ(x) e^{−i k x} dx = (1/(2π)) ∫_{0}^{π} e^{−i k x} dx = { ½,  k = 0;    0,  0 ≠ k even;    1/(i k π),  k odd. }

Therefore, the step function has the complex Fourier series

σ(x) ∼ ½ − (i/π) Σ_{l=−∞}^{∞} e^{(2l + 1) i x} / (2l + 1).

The reader should convince themselves that this is exactly the same series as the real
Fourier series (11.36). We are merely rewriting it using complex exponentials instead of
real sines and cosines.
Example 11.15. Let us find the Fourier series for the exponential function e^{a x}. It
is much easier to evaluate the integral for the complex Fourier coefficients, and so

c_k = ⟨ e^{a x} ; e^{i k x} ⟩ = (1/(2π)) ∫_{−π}^{π} e^{(a − i k) x} dx = [ e^{(a − i k) x} / (2π (a − i k)) ]_{x = −π}^{π}
    = [ e^{(a − i k) π} − e^{−(a − i k) π} ] / [ 2π (a − i k) ] = (−1)^k [ e^{a π} − e^{−a π} ] / [ 2π (a − i k) ]
    = (−1)^k (a + i k) sinh a π / [ π (a² + k²) ].

Therefore, the desired Fourier series is

e^{a x} ∼ (sinh a π / π) Σ_{k=−∞}^{∞} [ (−1)^k (a + i k) / (a² + k²) ] e^{i k x}.    (11.54)

As an exercise, the reader should try writing this as a real Fourier series, either by breaking
up the complex series into its real and imaginary parts, or by direct evaluation of the real
coefficients via their integral formulae (11.23).
The Delta Function
It is a remarkable, profound fact that Fourier analysis is completely compatible with
the calculus of generalized functions. The most important example, which demonstrates
that Fourier series can represent more general objects than mere functions, is the delta
function δ(x). Using its characterizing properties (10.35), the real Fourier coefficients are
computed as

a_k = (1/π) ∫_{−π}^{π} δ(x) cos k x dx = (1/π) cos(k · 0) = 1/π,
b_k = (1/π) ∫_{−π}^{π} δ(x) sin k x dx = (1/π) sin(k · 0) = 0.    (11.55)


Therefore,

δ(x) ∼ 1/(2π) + (1/π) [ cos x + cos 2x + cos 3x + ··· ].    (11.56)

Since δ(x) is an even function, it should come as no surprise that it has a cosine series.
To understand in what sense this series converges to the delta function, it will help to
rewrite it in complex form

δ(x) ∼ (1/(2π)) Σ_{k=−∞}^{∞} e^{i k x} = (1/(2π)) [ ··· + e^{−2 i x} + e^{−i x} + 1 + e^{i x} + e^{2 i x} + ··· ],    (11.57)

where the complex Fourier coefficients are computed as

c_k = (1/(2π)) ∫_{−π}^{π} δ(x) e^{−i k x} dx = 1/(2π).
The n-th partial sum

s_n(x) = (1/(2π)) Σ_{k=−n}^{n} e^{i k x}

of the series can be explicitly computed, since it forms a geometric series

Σ_{k=0}^{m} a r^k = a + a r + a r² + ··· + a r^m,    (11.58)

with initial term a = e^{−i n x}, ratio r = e^{i x}, and m = 2n indicating the number of terms.
The well-known geometric summation formula

Σ_{k=0}^{m} a r^k = a (r^{m+1} − 1) / (r − 1)    (11.59)

implies that

s_n(x) = (1/(2π)) Σ_{k=−n}^{n} e^{i k x} = (1/(2π)) e^{−i n x} [ e^{i (2n+1) x} − 1 ] / [ e^{i x} − 1 ] = (1/(2π)) [ e^{i (n+1) x} − e^{−i n x} ] / [ e^{i x} − 1 ]
     = (1/(2π)) [ e^{i (n + ½) x} − e^{−i (n + ½) x} ] / [ e^{i x/2} − e^{−i x/2} ] = (1/(2π)) sin (n + ½) x / sin ½ x.    (11.60)

To pass from the third to the penultimate expression, we multiplied numerator and denominator by e^{−i x/2}; then we used the formula (3.76) for the sine function in terms of
complex exponentials. Incidentally, (11.60) is the same as the n-th real partial sum

s_n(x) = (1/π) [ ½ + cos x + cos 2x + cos 3x + ··· + cos n x ] = (1/(2π)) sin (n + ½) x / sin ½ x,    (11.61)

which constitutes a remarkable trigonometric formula that is a bit more difficult to establish
directly. (Or, we could use (11.53).)

Figure 11.7. Partial Fourier Sums Approximating the Delta Function.
Graphs of the partial sums s_n(x) for various values of n are displayed in Figure 11.7.
Note that the spike, at x = 0, progressively becomes taller and thinner, converging to an
infinitely tall, infinitely thin delta spike. Indeed, by l'Hôpital's Rule,

lim_{x→0} (1/(2π)) sin (n + ½) x / sin ½ x = lim_{x→0} (1/(2π)) (n + ½) cos (n + ½) x / (½ cos ½ x) = (n + ½)/π → ∞    as n → ∞.

(An alternative, elementary proof of this fact is to note that, at x = 0, every term in the
original sum (11.57) is equal to 1.) Furthermore, the integrals remain fixed:

∫_{−π}^{π} s_n(x) dx = (1/(2π)) ∫_{−π}^{π} [ sin (n + ½) x / sin ½ x ] dx = (1/(2π)) ∫_{−π}^{π} Σ_{k=−n}^{n} e^{i k x} dx = 1,    (11.62)

as also required for convergence to the delta function. However, away from the spike, the
sums do not go to zero. Rather, the entire function oscillates faster and faster, with an
overall amplitude of csc ½ x = 1/ sin ½ x. As n gets large, the amplitude function appears
as an envelope of the increasingly rapid oscillations. Roughly speaking, the fact that
s_n(x) → δ(x) as n → ∞ means that the infinitely fast oscillations somehow cancel each
other out, and the net effect is zero away from the spike at x = 0. Thus, the convergence
of the Fourier sums to δ(x) is much more subtle than in the original limiting definition
(10.31). The technical term for this is weak convergence, which plays a very important
role in advanced mathematical analysis, [xxx].
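The closed form (11.60)-(11.61) and the unit integral (11.62) are easy to confirm numerically; the following sketch is illustrative only and not part of the text.

```python
import numpy as np

def s_n_direct(x, n):
    """Partial sum (1/(2*pi)) * sum_{k=-n}^{n} e^{ikx}, summed term by term."""
    k = np.arange(-n, n + 1)
    return np.real(np.sum(np.exp(1j * np.outer(x, k)), axis=1)) / (2 * np.pi)

def s_n_closed(x, n):
    """Closed form (11.60): sin((n + 1/2) x) / (2*pi*sin(x/2))."""
    return np.sin((n + 0.5) * x) / (2 * np.pi * np.sin(0.5 * x))

x = np.linspace(0.1, 3.0, 7)        # stay away from x = 0, where the closed form is 0/0
print(np.allclose(s_n_direct(x, 10), s_n_closed(x, 10)))    # True

# the integral over [-pi, pi] stays equal to 1, as in (11.62): only the k = 0 term survives
xx = np.linspace(-np.pi, np.pi, 20001)
print(2 * np.pi * s_n_direct(xx, 10).mean())                # approximately 1.0
```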
Remark : Although we stated that the Fourier series (11.56), (11.57) represent the
delta function, this is not entirely correct. Remember that all Fourier series are periodic

of period 2π, and so the function they represent is also periodic of period 2π and
can be obtained by periodically extending the values of the function between −π and π.
Therefore, (11.57) actually represents the periodic extension of the delta function

δ̃(x) = ··· + δ(x + 4π) + δ(x + 2π) + δ(x) + δ(x − 2π) + δ(x − 4π) + δ(x − 6π) + ··· ,    (11.63)

consisting of a periodic array of delta spikes concentrated at all integer multiples of 2π.

11.3. Differentiation and Integration.


If a series of functions converges sufficiently nicely, then one expects to be able to
integrate and differentiate it term by term; the resulting series should converge to the
integral and derivative of the original sum. For power series, the theory is straightforward,
and used extensively in the construction of series solutions of differential equations, series
for integrals of non-elementary functions, and so on. Appendix C develops some of the
details.
As we now appreciate, the convergence of Fourier series is a much more delicate
proposition, and so one must take considerably more care in the application of termwise differentiation and integration. Nevertheless, in favorable situations, both operations
lead to valid results, and provide a powerful means of constructing Fourier series of more
complicated functions. Even more remarkably, the calculus of Fourier series is completely
compatible with the calculus of generalized functions that we developed in Chapter 10.
In particular, differentiating the Fourier series for a suitably nice function with a jump
discontinuity leads to the Fourier series for the differentiated function, with a delta function
of the appropriate magnitude appearing at the discontinuity. This fact reassures us that
the rather mysterious construction of delta functions and their generalizations is indeed
the correct method for extending calculus to functions which do not possess derivatives in
the ordinary sense.
Integration of Fourier Series
Integration is a smoothing operation; the integrated function is always nicer than
the original function. Therefore, we would expect to be able to integrate Fourier series
without difficulty. However, there is a complication: the integral of a periodic function
is not necessarily periodic. The simplest example is the constant function 1, which is
certainly periodic, but its integral, namely x, is evidently not. On the other hand, the
integrals of the periodic sine and cosine functions appearing in the Fourier series are also
periodic. We conclude that only the constant term ½ a₀ in a Fourier series (11.20) might
cause us difficulty when we try to integrate.
We begin by trying to integrate Fourier series with zero constant term:

a₀/2 = (1/(2π)) ∫_{−π}^{π} f(x) dx = 0.

According to (2.4), ½ a₀ is the mean or average of the function f(x) on the interval [−π, π].
Therefore, a function has no constant term in its Fourier series if and only if it has zero
mean. It is easily shown that the mean zero functions are precisely the ones that remain
periodic upon integration.

Lemma 11.16. If f(x) is 2π periodic, then its integral g(x) = ∫_{0}^{x} f(y) dy is 2π
periodic if and only if f has mean zero on the interval [−π, π].

Remark: According to Lemma 11.11, odd functions automatically have mean zero.

Theorem 11.17. If f is piecewise continuous, 2π periodic, and has mean zero, then
its Fourier series can be integrated term by term, to produce the Fourier series

g(x) = ∫_{0}^{x} f(y) dy ∼ m + Σ_{k=1}^{∞} [ − (b_k / k) cos k x + (a_k / k) sin k x ]    (11.64)

for its integral. The constant term

m = (1/(2π)) ∫_{−π}^{π} g(x) dx

is the mean of the integrated function.


In many situations, the integration formula (11.64) provides a very convenient alternative to the direct derivation of the Fourier coefficients.
Example 11.18. The function f(x) = x has mean zero, ∫_{−π}^{π} x dx = 0, since it is
odd. Let us integrate its Fourier series

x ∼ 2 Σ_{k=1}^{∞} [ (−1)^{k−1} / k ] sin k x    (11.65)

that we found in Example 11.2. Applying the general formula (11.64), we find

½ x² ∼ π²/6 − 2 Σ_{k=1}^{∞} [ (−1)^{k−1} / k² ] cos k x = π²/6 − 2 [ cos x − cos 2x/4 + cos 3x/9 − cos 4x/16 + ··· ],    (11.66)

where the constant term is found by integrating the left hand side:

(1/(2π)) ∫_{−π}^{π} (x²/2) dx = π²/6.

If we integrate each trigonometric summand in a Fourier series (11.20) from 0 to x,
we obtain

∫_{0}^{x} cos k y dy = sin k x / k,    while    ∫_{0}^{x} sin k y dy = 1/k − cos k x / k.

The constant terms 1/k arising from the sine integrals do not appear explicitly in (11.64),
and so must be hidden in the constant term m. We deduce that the mean value of the
integrated function can be computed using the Fourier sine coefficients of f via the formula

(1/(2π)) ∫_{−π}^{π} g(x) dx = m = Σ_{k=1}^{∞} b_k / k.    (11.67)

For example, the result of integrating both sides of equation (11.65) from 0 to x is

x²/2 ∼ 2 Σ_{k=1}^{∞} [ (−1)^{k−1} / k² ] (1 − cos k x).

The constant terms sum up to yield the mean value of the integrated function:

2 Σ_{k=1}^{∞} (−1)^{k−1} / k² = (1/(2π)) ∫_{−π}^{π} (x²/2) dx = π²/6,    (11.68)

which is slightly different than (11.44).


If f (x) does not have mean zero, its Fourier series has a nonzero constant term,
f (x)

a0 X
[ ak cos k x + bk sin k x ] .
+
2
k=1

In this case, the result of integration will be


g(x) =

f (y) dy
0


X
a0
bk
ak

x+m+
cos k x +
sin k x ,
2
k
k

(11.69)

k=1

where m is given in (11.67). The right hand side is not, strictly speaking, a Fourier series.
There are two ways to validate this formula within the strict Fourier series framework.
Either we can write (11.69) as the Fourier series for the difference


X
bk
ak
a0

x m+
cos k x +
sin k x ,
g(x)
2
k
k

(11.70)

k=1

which is a 2 periodic function. Alternatively, one can replace x by its Fourier series
(11.25), and theZ result will be the Fourier series for the 2 periodic extension of the
x

integral g(x) =

f (y) dy.

Differentiation of Fourier Series


Differentiation has the opposite effect to integration. Differentiation makes a function
worse. Therefore, to justify taking the derivative of a Fourier series, we need to know that
the differentiated function remains reasonably nice. Since we need the derivative f′(x) to
be piecewise C¹ for the convergence Theorem 11.7 to be applicable, we have to require
that f(x) itself be continuous and piecewise C².

Theorem 11.19. If f is 2π periodic, continuous, and piecewise C², then its Fourier
series can be differentiated term by term, to produce the Fourier series for the derivative

h(x) = f′(x) ∼ Σ_{k=1}^{∞} [ k b_k cos k x − k a_k sin k x ].    (11.71)

Example 11.20. If we differentiate the Fourier series (11.42) for f(x) = | x |, we
obtain

f′(x) ∼ (4/π) [ sin x + sin 3x/3 + sin 5x/5 + sin 7x/7 + ··· ].    (11.72)

The derivative (10.49) of the absolute value function is the sign function

d| x |/dx = sign x = { +1,  x > 0;    −1,  x < 0. }

Note that sign x = σ(x) − σ(−x) is the difference of two step functions. Indeed, subtracting
the step function Fourier series (11.36) at −x from the same series at x reproduces (11.72).
Example 11.21. If we differentiate the Fourier series

x ∼ 2 Σ_{k=1}^{∞} [ (−1)^{k−1} / k ] sin k x = 2 [ sin x − sin 2x/2 + sin 3x/3 − sin 4x/4 + ··· ]

for x, we obtain an apparent contradiction:

1 ∼ 2 Σ_{k=1}^{∞} (−1)^{k+1} cos k x = 2 cos x − 2 cos 2x + 2 cos 3x − ··· .    (11.73)

But the Fourier series for 1 just consists of a single constant term! (Why?)

The resolution of this difficulty is not hard. The Fourier series (11.25) does not
converge to x, but rather to its periodic extension f̃(x), which has a jump discontinuity
of magnitude 2π at odd multiples of π. Thus, Theorem 11.19 is not directly applicable.
Nevertheless, we can assign a consistent interpretation to the differentiated series. As
discussed in Section 10.2, the derivative f̃′(x) of the periodic extension is not equal to
the constant function 1, but, rather, has an additional delta function concentrated at each
jump discontinuity:

f̃′(x) = 1 − 2π Σ_{j=−∞}^{∞} δ(x − (2j + 1)π) = 1 − 2π δ̃(x − π),

where δ̃ denotes the 2π periodic extension of the delta function, cf. (11.63). The differentiated Fourier series (11.73) does, in fact, converge to this modified distributional
derivative! Indeed, differentiation and integration of Fourier series is entirely compatible
with the calculus of generalized functions.

Finally, it is worth noting that one can equally well integrate and differentiate complex
Fourier series. Thus, under appropriate hypotheses, if

f(x) ∼ Σ_{k=−∞}^{∞} c_k e^{i k x},    then    f′(x) ∼ Σ_{k=−∞}^{∞} i k c_k e^{i k x}.

Integration is treated similarly; see Exercise .


An important consequence of the differentiation formulae is the fact that, the smoother
the function is, the faster its Fourier coefficients a_k, b_k decay to zero as k → ∞. For a
Fourier series (11.20) to converge to a piecewise continuous function, we must, at the very
least, have a_k → 0 and b_k → 0 as k → ∞; see Lemma 11.35 below. If we assume that f(x)
is 2π periodic, continuous and piecewise C², then Theorem 11.19 implies that the Fourier
series for f′(x) converges, and so its Fourier coefficients, namely k b_k and −k a_k, must
tend to zero as k → ∞. In general, if f is 2π periodic, has n − 1 continuous derivatives,
and its n-th derivative is at least piecewise C¹, then the Fourier coefficients of f^{(n)}(x)
must tend to zero, which, by a simple induction, implies that

k^n a_k → 0,    k^n b_k → 0    as    k → ∞.

In particular, this requires that the Fourier coefficients of f satisfy

| a_k | < C / k^n,    | b_k | < C / k^n,    (11.74)

for some constant C > 0. If f is infinitely differentiable, or, even more restrictively,
analytic, then its Fourier coefficients go to zero faster than any power of k. For instance,
if | a_k |, | b_k | < C e^{−k}, then the Fourier sum is a C^∞ function. Thus, one can detect the
degree of smoothness of a function by looking at how rapidly its Fourier coefficients decay
to zero. See Theorem 11.30 below for a more precise result.

Example 11.22. The 2π periodic extension of the function | x | is continuous with
piecewise continuous first derivative. Its Fourier coefficients (11.41) satisfy the estimate
(11.74) for n = 2, which is in accord with the previous remarks. On the other hand, the
Fourier coefficients (11.24) of the step function σ(x) only tend to zero as 1/k, reflecting the
fact that its periodic extension is only piecewise continuous. Finally, the Fourier coefficients
(11.55) for the delta function do not tend to zero at all, indicative of the fact that it is not
an ordinary function, and its Fourier series does not converge in the standard sense.
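The decay rates in Example 11.22 are easy to observe numerically. The sketch below (illustrative only; the test functions and helper name are assumptions) compares coefficient sizes at a few odd frequencies for a function whose periodic extension has a jump, a continuous function with a corner, and a smooth periodic function.

```python
import numpy as np
from scipy.integrate import quad

def real_coeffs(f, k):
    """Fourier coefficients a_k, b_k from (11.23) by quadrature."""
    ak = quad(lambda x: f(x) * np.cos(k * x), -np.pi, np.pi, limit=200)[0] / np.pi
    bk = quad(lambda x: f(x) * np.sin(k * x), -np.pi, np.pi, limit=200)[0] / np.pi
    return ak, bk

funcs = {
    "x (jump in extension, ~1/k)":       lambda x: x,
    "|x| (continuous, corner, ~1/k^2)":  lambda x: abs(x),
    "exp(cos x) (smooth, very fast)":    lambda x: np.exp(np.cos(x)),
}
for name, f in funcs.items():
    sizes = [max(abs(c) for c in real_coeffs(f, k)) for k in (5, 9, 17, 33)]
    print(name, [f"{s:.1e}" for s in sizes])
# the first row shrinks like 1/k, the second like 1/k^2, and the smooth
# function's coefficients drop to the level of rounding error almost immediately
```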

11.4. Change of Scale.


So far, we have only dealt with Fourier series on a standard interval of length 2π. (We
chose [−π, π], but the statements and formulas are easily adapted to any other interval
of the same length, e.g., [0, 2π].) Since physical objects like bars and strings do not all
come in this particular length, we need to understand how to adapt the formulas when

(If the function is not periodic, one must impose the assumptions on its periodic extension
for the remarks to be valid.)


we are dealing with an interval of some other length. The basic idea already appears
in the discussion of adapting the orthogonal Legendre polynomials to other intervals in
Section 5.4, but will be repeated from scratch here.
Any symmetric interval [−ℓ, ℓ] of length 2ℓ can be rescaled to the standard interval
[−π, π] by using the linear change of variables

y = π x / ℓ,    so that    −π ≤ y ≤ π    whenever    −ℓ ≤ x ≤ ℓ.    (11.75)

Given a function f(x) defined on [−ℓ, ℓ], the rescaled function F(y) = f(ℓ y / π) lives on
[−π, π]. Let

F(y) ∼ a₀/2 + Σ_{k=1}^{∞} [ a_k cos k y + b_k sin k y ],    (11.76)

be the standard Fourier series for F(y), so that

a_k = (1/π) ∫_{−π}^{π} F(y) cos k y dy,    b_k = (1/π) ∫_{−π}^{π} F(y) sin k y dy.

Then, reverting to the unscaled variable x, we deduce that

f(x) ∼ a₀/2 + Σ_{k=1}^{∞} [ a_k cos(k π x / ℓ) + b_k sin(k π x / ℓ) ].    (11.77)

The Fourier coefficients of f(x) can be computed directly. Indeed, replacing the integration
variable by y = (π/ℓ) x, and noting that dy = (π/ℓ) dx, we deduce the modified formulas

a_k = (1/ℓ) ∫_{−ℓ}^{ℓ} f(x) cos(k π x / ℓ) dx,    b_k = (1/ℓ) ∫_{−ℓ}^{ℓ} f(x) sin(k π x / ℓ) dx,    (11.78)

for the Fourier coefficients of f.


All of the convergence results, integration and differentiation formulae, etc., that were
proved for the interval [ , ] carry over, essentially unchanged, to Fourier series on
nonstandard intervals. In particular, adapting our basic convergence Theorem 11.7, we
conclude that if f (x) is piecewise C1 , periodic of period 2 `, and takes on the midpoint
values at jump discontinuities, then the rescaled Fourier series (11.77) converges to it
everywhere.
Example 11.23. Let us compute the Fourier series for the function f(x) = x on the
interval −1 ≤ x ≤ 1. Since f is odd, only the sine coefficients will be nonzero. We have

b_k = ∫_{−1}^{1} x sin(k π x) dx = [ − x cos(k π x)/(k π) + sin(k π x)/(k π)² ]_{x = −1}^{1} = 2(−1)^{k+1}/(k π).

The resulting Fourier series is

x ∼ (2/π) [ sin π x − sin 2π x / 2 + sin 3π x / 3 − ··· ].

The series converges to the 2 periodic extension of the function x, namely

f̃(x) = { x − 2m,  2m − 1 < x < 2m + 1;    0,  x = m, }

where m is an arbitrary integer.
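A quick numerical confirmation of the rescaled formula (11.78) for Example 11.23, where ℓ = 1, follows; the sketch is illustrative only.

```python
import numpy as np
from scipy.integrate import quad

def b_k_rescaled(f, k, ell):
    """Sine coefficient from (11.78) on the interval [-ell, ell]."""
    return quad(lambda x: f(x) * np.sin(k * np.pi * x / ell), -ell, ell)[0] / ell

ell = 1.0
for k in (1, 2, 3, 4):
    print(k, b_k_rescaled(lambda x: x, k, ell), 2 * (-1) ** (k + 1) / (k * np.pi))
# the two columns agree: b_k = 2(-1)^{k+1}/(k*pi), as computed in Example 11.23
```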

We can also reformulate complex Fourier series on the nonstandard interval [−ℓ, ℓ].
Scaling the variables in (11.51) as in (11.75), we find

f(x) ∼ Σ_{k=−∞}^{∞} c_k e^{i k π x / ℓ},    where    c_k = (1/(2ℓ)) ∫_{−ℓ}^{ℓ} f(x) e^{−i k π x / ℓ} dx.    (11.79)

Again, this is just another way of writing the real Fourier series (11.77).

For a more general interval [a, b] there are two options. Either one can take a function
f(x) defined for a ≤ x ≤ b and periodically extend it to a function f̃(x) that has period
b − a. In particular, one can apply the same Fourier series (11.77) on the symmetric
interval [½(a − b), ½(b − a)] of width 2ℓ = b − a. An alternative is to translate the interval
by an amount ½(a + b) to make it symmetric; this is done by the change of variables
x̂ = x − ½(a + b). The two methods are essentially equivalent, and details are left to the
reader.

11.5. Convergence of the Fourier Series.

In this final, optional section, we establish the basic convergence results for Fourier
series. This will be the most theoretical of the sections we encounter, but the results
are absolutely fundamental to the full range of applications of Fourier series methods.
Moreover, the techniques and proofs serve as a good introduction to some of the important
tools of advanced mathematical analysis. A firm understanding of the range and the
limitations of the Fourier analysis requires some familiarity with the basic theoretical
developments. However, the reader who is exclusively interested in simple applications
can safely proceed to the subsequent chapters.
Unlike power series which converge to analytic functions on the interval of convergence,
and diverge elsewhere (the only tricky point being whether or not the series converges at
the endpoints), the convergence of Fourier series is a much more subtle matter. Indeed,
some of the more detailed convergence issues remain unresolved to this day. A large part
of the difficulty stems from the intricacies of convergence in infinite-dimensional function
spaces.
Convergence in Vector Spaces
In a finite-dimensional vector space, e.g., ℝⁿ, convergence of sequences, and hence
series, is straightforward; there is essentially only one way for a sequence of vectors
v^{(k)} ∈ ℝⁿ to converge. Convergence is governed by any of the following equivalent criteria:

(a) The vectors converge: v^{(k)} → v* ∈ ℝⁿ.
(b) The individual components of v^{(k)} = (v₁^{(k)}, ..., vₙ^{(k)}) converge, so v_i^{(k)} → v_i*, for
i = 1, ..., n.
(c) The difference in norms goes to zero: ‖ v^{(k)} − v* ‖ → 0.
The last requirement, known as convergence in norm, does not, in fact, depend on which
norm is chosen. According to Theorem 3.19, on a finite-dimensional vector space, all norms
are essentially equivalent, and if one norm goes to zero, so does any other norm.
Convergence criteria are most definitely not equivalent in an infinite-dimensional
vector space. There are many different ways in which a sequence of functions v_k(x) may
converge. The simplest of these is pointwise convergence, where we require

lim_{k→∞} v_k(x) = v*(x)    for all    x.    (11.80)

In other words, the functions' values at each individual point in their domain of definition
converge in the usual sense. Pointwise convergence is the function space counterpart of
convergence of the components of a vector. Indeed, if we replace a function v(x) by its
sample vector v = ( v(x₁), ..., v(xₙ) )ᵀ, then pointwise convergence of the functions implies
component-wise convergence of the corresponding sample vectors.
On the other hand, convergence in norm of the sequence requires

lim_{k→∞} ‖ v_k − v* ‖ = 0,

where ‖·‖ is a fixed function space norm. Not all norms on an infinite-dimensional function
space are equivalent: a function can be small in one norm, but large in another. As a result,
convergence in norm will depend upon the choice of norm. Moreover, convergence in norm
does not necessarily imply pointwise convergence or vice versa. Examples can be found in
the exercises.
There are, in fact, a bewildering variety of different convergence mechanisms in function space, including convergence in norm, uniform convergence, pointwise convergence,
weak convergence, and so on. All play a significant role in advanced mathematical analysis. For our applications, we shall be content to study just the most basic aspects of
convergence of the Fourier series. Much more detail is available in more advanced texts,
e.g., [42, 132].
Uniform Convergence
Proving uniform convergence of a Fourier series is reasonably straightforward, and
so we will begin there. The student has probably already encountered, albeit perhaps
confusingly, the basic definition of uniform convergence of functions. For the record, we
restate it here.
Definition 11.24. A sequence of functions v_k(x) is said to converge uniformly to
v*(x) on a subset I ⊂ ℝ if, for every ε > 0, there exists an integer n such that

| v_k(x) − v*(x) | < ε    for all x ∈ I and all k ≥ n.    (11.81)

The key point, and the reason for the term uniform convergence, is that the
integer n depends only upon ε and not on the point x ∈ I. Functions can converge
pointwise, but not uniformly; the Gibbs phenomenon in Figure 11.5 is the prototypical
example of a nonuniformly convergent sequence. For a given ε > 0, the size of the integer
n required for (11.81) to hold will be larger and larger the closer the point x is to the
discontinuity. Thus, for a given ε, there is no uniformly valid n that fulfills the requirement
(11.81) for all points x. A detailed discussion of these issues, including the proofs of the
following basic theorems, can be found, for instance, in [9].

A key consequence of uniform convergence is that it preserves continuity.

Theorem 11.25. If u_k(x) → u*(x) converges uniformly, and each u_k(x) is continuous, then u*(x) is also a continuous function.
The convergence of a series Σ_{k=1}^{∞} u_k(x) is governed by the convergence of the sequence
of their partial sums

v_n(x) = Σ_{k=1}^{n} u_k(x).    (11.82)

The most useful test for uniform convergence of series of functions is known as the Weierstrass M test, due to the nineteenth century German Karl Weierstrass, known as the
father of modern analysis.
Theorem 11.26. Suppose the functions u_k(x) satisfy

| u_k(x) | ≤ m_k    for all    x ∈ I,    (11.83)

where the m_k ≥ 0 are fixed positive constants. If the series

Σ_{k=1}^{∞} m_k < ∞    (11.84)

converges, then the series

Σ_{k=1}^{∞} u_k(x) = f(x)    (11.85)

converges uniformly to a function f(x) for all x ∈ I. In particular, if the summands u_k(x)
in Theorem 11.26 are continuous, so is the sum f(x).
With a little care, we are allowed to manipulate uniformly convergent series just like
finite sums. Thus, if (11.85) is a uniformly convergent series, so is the term-wise product

Σ_{k=1}^{∞} g(x) u_k(x) = g(x) f(x),    (11.86)

provided | g(x) | ≤ C is a bounded function for x ∈ I. We can also integrate a uniformly
convergent series term by term, and the integrated series

∫_{a}^{x} ( Σ_{k=1}^{∞} u_k(y) ) dy = Σ_{k=1}^{∞} ∫_{a}^{x} u_k(y) dy = ∫_{a}^{x} f(y) dy    (11.87)

is uniformly convergent. Differentiation is also allowed, but only if the differentiated
series converges uniformly.

Proposition 11.27. If

Σ_{k=1}^{∞} u_k′(x) = g(x)    (11.88)

is a uniformly convergent series, then

Σ_{k=1}^{∞} u_k(x) = f(x)

is also uniformly convergent, and, moreover, f′(x) = g(x).


We are particularly interested in applying these results to the Fourier series, which,
for convenience, we take in its complex form
f (x)
Since x is real,

ck e i k x .

(11.89)

k =

ikx
e
1,

and hence the individual summands are bounded by

c eikx | c |
for all
k
k

x.

Applying the Weierstrass M test, we immediately deduce the basic result on uniform
convergence of Fourier series.
Theorem 11.28. If the Fourier coefficients ck satisfy

k =

| ck | < ,

(11.90)

then the Fourier series (11.89) converges uniformly to a continuous function f (x), with c k
equal to the Fourier coefficients of the sum.
Proof : Uniform convergence follows from Theorem 11.26, and continuity is a consequence of Theorem 11.25. To show that the ck actually are the Fourier coefficients of the
sum, we multiply the Fourier series by e i kx and integrate term by term from to .
As noted in (11.86), (11.87), both operations are valid due to the uniform convergence of
the series.
Q.E.D.

The one thing that the theorem does not guarantee is that the original function f(x)
used to compute the Fourier coefficients c_k is the same as the function f̃(x) obtained by
summing the resulting Fourier series! Indeed, this may very well not be the case. As we
know, the function that the series converges to is necessarily 2π periodic. Thus, at the
very least, f̃(x) will be the 2π periodic extension of f(x). But even this may not suffice.
If two functions f(x) and f̂(x) are the same except on a finite set of points x₁, ..., x_m,
then they have the same Fourier coefficients. More generally, two functions which agree
everywhere outside a set of measure zero will have the same Fourier coefficients. In this way,
a convergent Fourier series distinguishes one among a collection of essentially equivalent
functions.

Here measure is the rigorous generalization of the notion of the length of an interval
to more general subsets S ⊂ ℝ. In particular, S has measure zero if it can be covered by a
collection of intervals of arbitrarily small total length. For example, any collection of finitely
many points, or even countably many points, e.g., the rational numbers, has measure zero.
The proper development of the notion of measure, and the resulting Lebesgue theory of
integration, can be learned in a course in real analysis, [105], and will only be touched
upon here.
We further note, without proof, the following general result on uniform convergence
of the Fourier series on subintervals; see [22, 42, 132] for details.
Theorem 11.29. Let f(x) be 2π periodic and piecewise C¹. If f is continuous for
a < x < b, then its Fourier series converges uniformly to f(x) on any closed subinterval
a + δ ≤ x ≤ b − δ, with δ > 0.

Thus, the Fourier series (11.36) for the step function does converge uniformly if we
stay away from the discontinuities; for instance, by restriction to a subinterval of the
form [δ, π − δ] or [−π + δ, −δ] for any 0 < δ < ½π. This confirms our observation
that the nonuniform Gibbs phenomenon is progressively more and more localized near the
discontinuities.
Smoothness and Decay
The uniform convergence criterion (11.90) requires, at the very least, that the Fourier
coefficients decay to zero: ck 0 as k . In fact, the coefficients cannot tend to zero
too slowly. For example, the individual summands of the infinite series

k =

1
| k |

(11.91)

go to 0 as k for all > 0, but the series converges if and only if > 1. (This result
is a consequence of the standard integral test for series, [9].) Thus, if we can bound the
Fourier coefficients by
| ck |

M
| k |

for all

| k | 0,

(11.92)

for some power > 1 and some positive constant M > 0, then the Weierstrass M test will
guarantee that the Fourier series converges uniformly to a continuous function.
3/7/03

465

c 2003

Peter J. Olver

An important consequence of the differentiation formulae (11.71) for Fourier series is


the fact that the faster the Fourier coefficients of a function tend to zero as k , the
smoother the function is. Thus, one can detect the degree of smoothness of a function by
looking at how rapidly its Fourier coefficients decay to zero. More rigorously, we have:
Theorem 11.30. If the Fourier coefficients satisfy

k=1

a2k

b2k

k =

k n | ck | < ,

(11.93)

then the Fourier series converges to a 2 periodic function which is n times continuously
differentiable: f (x) Cn . Moreover, for any k n, the k times differentiated Fourier
series converges uniformly to the corresponding derivative f (k) (x).
Proof : This is an immediate consequence of Proposition 11.27. Application of the
Weierstrass M test to the differentiated Fourier series and use of (11.93) completes the
proof.
Q.E.D.
Corollary 11.31. If the Fourier coefficients satisfy (11.92) for some > n + 1, then
the function f (x) is n times continuously differentiable.
Thus, the faster the Fourier coefficients go to zero at large frequency k, the smoother
the function is. If the Fourier coefficients go to zero faster than any power of k, e.g.,
exponentially fast, then the function is infinitely differentiable. Analyticity is a little more
delicate, and we refer the reader to [132] for details.
Hilbert Space
In order to make further progress, we need to take a little detour. It turns out the
proper setting for the rigorous theory of Fourier series is the most important function
space in modern physics and analysis, known as Hilbert space. The precise definition of
this infinite-dimensional inner product space is rather technical, but a rough version goes
as follows:
Definition 11.32. A complex-valued function f (x) defined for all x is
called square-integrable if it satisfies
Z
1
2
kf k =
| f (x) |2 dx < .
(11.94)
2
The Hilbert space L2 = L2 [ , ] is the vector space consisting of all complex-valued
square-integrable functions on the interval [ , ].

Note that (11.94) is the fundamental L2 norm based on the Hermitian inner product
Z
1
f (x) g(x) dx.
(11.95)
hf ;gi =
2

The triangle inequality (3.16), which is


kcf + dgk |c| kf k + |d| kgk
3/7/03

466

c 2003

Peter J. Olver

implies that the Hilbert space is a complex vector space, i.e., if f, g L 2 , so is any linear
combination c f + d g. The CauchySchwarz inequality (3.13),
|hf ;gi| kf k kgk
implies that the inner product of two square-integrable functions is well-defined and finite.
In particular, the Fourier coefficients of a function f (x) are defined as inner products
Z
1
ikx
ck = h f ; e
i=
f (x) e i k x dx
2
of f with the complex exponentials, and hence are well-defined for any f L 2 .
There are some interesting analytical subtleties that arise when one tries to prescribe
precisely which functions are to be admitted to Hilbert space. In particular, every piecewise continuous function belongs to the Hilbert space L2 . but certain singularities are
also allowed. For example, xr belongs to L2 for any r < 21 , but not if r 12 . However,
analysis requires limiting procedures, and the Hilbert space must be complete in the
sense that appropriately convergent sequences of functions have a limit. The completeness requirement relies on the development of the more sophisticated Lebesgue theory of
integration, which was formalized in the early part of the twentieth century by the French
mathematician Henri Lebesgue and just in time for quantum mechanics! Any function
which is square-integrable in the Lebesgue sense is admitted into L2 . This includes such
1
non-piecewise continuous functions as sin and x1/4 , as well as the strange function
x

1
if x is a rational number,
r(x) =
(11.96)
0
if x is irrational.
One soon discovers that general square-integrable functions can be quite bizarre.
A second complication is that (11.94) does not, strictly speaking, define a norm once
we allow discontinuous functions. For example, the piecewise continuous function

1,
x = 0,
(11.97)
f0 (x) =
0,
x 6= 0,
has norm zero, k f0 k = 0, even though it is not zero everywhere. Indeed, any function
which is zero except on a set of measure zero also has norm zero, including the function
(11.96). Therefore, in order to make (11.94) into a legitimate norm on Hilbert space, we
must agree to identify any two functions which have the same values except on a set of
measure zero. Thus, the preceding examples f0 (x) and r(x) are both considered as being
identical to the zero function in Hilbert space. Thus, although we treat them as if they
were ordinary functions, each element of Hilbert space is not, in fact, a function, but,
rather, an equivalence class of functions all differing on a set of measure zero. All this

The proper technical construction is to require that every Cauchy sequence of functions
uk (x) L2 , meaning a sequence with the property that k un um k 0 whenever both n, m
, must converge: uk u? L2 .

3/7/03

467

c 2003

Peter J. Olver

might strike the reader as becoming much too abstract and arcane. In fact, the casual
reader will not lose much by assuming that the functions in L2 are always piecewise
continuous and square-integrable. Nevertheless, the full analytical power of Hilbert space
theory is only unleashed by allowing much more general kinds of functions into the fold.
After its invention by David Hilbert around the turn of the twentieth century, physicists in the 1920s suddenly realized that Hilbert space was the correct setting to establish
the modern theory of quantum mechanics. A quantum mechanical wave function is a element L2 that has unit norm: k k = 1. Thus, the set of wave functions is merely
the unit sphere in Hilbert space. In quantum mechanics, a wave function is endowed with
a probabilistic interpretation. The modulus | (x) | of the wave function at a position
x quantifies the probability of finding the corresponding particle (photon, electron, etc.)
there.
correctly, the probability that the particle resides in an interval [ a, b ] is equal
s More
Z b
1
to
| (x) |2 dx. In particular, the wave function has unit norm
2 a
s
Z
1
| (x) |2 dx = 1
kk =
2
because the particle must certainly, i.e., with probability 1, be somewhere!
Convergence in Norm
We are now in a position to discuss convergence in norm of the Fourier series. We
begin with the basic definition, which makes sense for any inner product space.
Definition 11.33. Let V be an inner product space. A sequence vn V is said to
converge in norm to w V if k vn w k 0 as n .
Remark : Convergence in norm is very different from pointwise convergence. For instance, it is possible, Exercise , to construct a sequence of functions that converges in
norm to 0, but does not converge pointwise anywhere!
We are particularly interested in the convergence in norm of the Fourier series of
square integrable functions. Let
sn (x) =

n
X

ck e i k x

(11.98)

k = n

be the nth partial sum of the Fourier series for the function f (x) L2 . The partial sum
(11.98) is an element of the subspace T (n) L2 consisting of all trigonometric polynomials
of degree at most n, cf. Example 2.12. It is, in fact, a distinguished element of this
subspace namely, it is the closest function in T (n) to f L2 , where the distance

Here we are considering the physical space to be represented by the one-dimensional interval [ , ]. The more physically relevant case of three-dimensional space is treated similarly,
replacing the single integral by a triple integral over all of R 3 .

3/7/03

468

c 2003

Peter J. Olver

between functions is measured by the L2 norm of there difference: k f g k! Thus, in the


language of Chapter 4, the Fourier partial sum sn (x) is the best trigonometric polynomial
approximation to the given function f (x) in the least squares sense. This important
characterization of the Fourier partial sums is, in fact, an immediate consequence of the
orthonormality of the trigonometric basis.
Theorem 11.34. The nth order Fourier partial sum sn (x) is the closest approximation to f (x) in the space of trigonometric polynomials T (n) , meaning that it minimizes the
L2 norm of the difference
Z
1
2
| f (x) pn (x) |2 dx
(11.99)
k f pn k =

among all possible degree n trigonometric polynomials
pn (x) =

n
X

k = n

dk e i k x T (n) .

(11.100)

Proof : The proof is, in fact, a function space version of the proof of the finitedimensional Theorems 5.33 and 5.35. Note first that, owing to the orthonormality (11.50)
of the basis exponentials, we can compute the norm of a trigonometric polynomial (11.100)
by summing the squared moduli of its Fourier coefficients:
k p n k2 = h p n ; p n i =

n
X

k,l = n

dk dl h e i k x ; e i l x i =

k = n

see also (5.6). Therefore, we can compute


k f p n k2 = k f k 2 2 h f ; p n i + k p n k2 = k f k 2 2
2

= kf k 2

n
X

c k dk +

n
X

k = n

k = n

| dk | = k f k

n
X

k = n
n
X

k = n

n
X

dk h f ; e i k x i + k p n k 2
2

| ck | +

The last equality results from adding and subtracting the norm
k sn k2 =
of the Fourier partial sum. Therefore,
2

n
X

k = n

| dk |2 ;

n
X

k = n

| dk c k |2

| c k |2

k f p n k = k f k k sn k +

n
X

k = n

| dk c k |2 .

The first and second terms in the right hand expression are determined by f (x) and hence
cannot be altered by the choice of trigonometric polynomial pn (x), which only affects the
final summation. Since it is a sum of nonnegative quantities, the sum is, in fact, minimized
by setting all the summands to zero, i.e., setting dk = ck . We conclude that k f pn k is
minimized if and only if dk = ck are the Fourier coefficients, and hence pn (x) = sn (x) is
the Fourier partial sum.
Q.E.D.
3/7/03

469

c 2003

Peter J. Olver

Setting pn = sn in the preceding formula, so dk = ck , we conclude that


2

k f sn k = k f k k sn k = k f k

n
X

k = n

| c k |2 .

(11.101)

Now, the left hand side of this equality is always non-negative


k f sn k2 0.
Applying this inequality to the right hand side, we conclude that the Fourier coefficients
of the function f must satisfy the basic inequality
n
X

k = n

| c k |2 k f k 2 .

Since we are summing a sequence of non-negative numbers whose partial sums are uniformly bounded, the limiting summation as n will exist and also be bounded by the
right hand side. We have thus proved Bessels inequality

k =

| c k |2 k f k 2 .

(11.102)

As before, if a series is to converge, the individual summands must go to zero: | c k |2 0.


We therefore deduce an immediate corollary an easy form of the RiemannLebesgue
Lemma.
Lemma 11.35. If f L2 is square integrable, then its Fourier coefficients satisfy
Z
1
f (x) e i k x dx 0,
as
| k | .
ck =
2
This is equivalent to the convergence of the
Z
1
ak =
f (x) cos k x dx

Z
1
f (x) sin k x dx
bk =

real Fourier coefficients

0,
as
k .

Remark : As before, the convergence of the sum (11.102) requires that the coefficients
ck cannot tend to zero too slowly. For instance, if ck satisfies the power bound
| ck | M | k | ,

then

k =

| c k |2 <

provided

> 21 .

Uniform convergence required > 1, cf. (11.92), and hence convergence in norm imposes
a less restrictive assumption on the Fourier coefficients. Indeed, the Fourier series may
very well converge in norm to a discontinuous function, which is not possible for uniform
convergence.
3/7/03

470

c 2003

Peter J. Olver

In fact, there are some bizarre continuous functions whose Fourier series do not converge uniformly, failing to converge at all at some points. A deep result says that the
Fourier series of a continuous function converges except possibly on a set of measure zero,
[132]. Again, the subtle details of the convergence of Fourier series are rather delicate, and
lack of space and analytical tools prevents us from delving any further into these issues.
Completeness
As we know, specification of a basis allows one to describe all elements of a finitedimensional vector. The number of basis elements equals the dimension of the vector
space. For an infinite-dimensional vector space, there are, by definition, infinitely many
linearly independent elements, and no finite collection can serve as a basis. The question
then arises to what extent an infinite collection of linearly independent elements can be
considered as a basis for the space. Mere counting will no longer suffice, since omitting
one or two or any finite number, or even some infinite subcollections from a supposed
basis will still leave infinitely many linearly independent elements of the vector space, but
clearly the reduced collection should, in some sense, no longer serve to define a basis. The
curse of infinity strikes again! For example, while the complete trigonometric collection
1, cos k x, sin k x for k = 1, 2, 3, . . . can represent any 2 periodic L 2 function as a Fourier
series, the subcollection cos k x, sin k x will only represent functions with mean zero, while
the functions sin k x only represent odd functions. All three collections have infinitely many
elements, but only the first can be properly called a basis. In general, just because we have
found a infinite collection of independent elements, how do we know that we have enough,
and are not missing one or two or 10,000 or even infinitely many additional elements?
The concept of completeness serves to properly formalize the notion of a basis
of an infinite-dimensional vector space. We shall discuss completeness in a general, abstract setting, but the key example is, of course, the Hilbert space L2 and the system of
trigonometric or complex exponential functions forming a Fourier series. Other important examples arising in later applications include Bessel functions, Legendre polynomials,
spherical harmonics, and general systems of eigenfunctions of self-adjoint boundary value
problems. For simplicity, we only define completeness in the case of orthonormal systems.
Similar arguments will clearly apply to orthogonal systems, but the additional normality
condition helps to simplify the formulae.
Let V be an infinite-dimensional inner product space. Suppose that u1 , u2 , u3 , . . . V
form an orthonormal collection of elements of V , so

1
i = j,
(11.103)
h ui ; u j i =
0,
i 6= j.
A straightforward argument proves that the ui are linearly independent; see Proposition 5.4. Given a general element f V , we form its generalized Fourier series
f

c k uk ,

where

k=1

ck = h f ; uk i.

(11.104)

The Fourier coefficients ck are given by our usual orthonormal basis formula (5.5), which
is obtained by formally taking the inner product of the series with u k .
3/7/03

471

c 2003

Peter J. Olver

Definition 11.36. An orthonormal system of elements u1 , u2 , u3 , . . . V is called


complete if the generalized Fourier series (11.104) of any f V converges in norm to f .
In other words,
n
X
k f sn k 0
where
sn =
c k uk ,
(11.105)
k=1

is the nth partial sum of the generalized Fourier series (11.104).


Thus, completeness requires that every element can be arbitrarily closely approximated (in norm) by a suitable linear combination of the basis elements. A complete orthonormal system should be viewed as the infinite-dimensional version of an orthonormal
basis of a finite-dimensional vector space.
The key result for Fourier series is that the complex exponentials, or, equivalently the
trigonometric functions, form a complete system.
Theorem 11.37. The complex exponentials e i k x , k = 0, 1, 2, . . ., form a complete orthonormal system in L2 [ , ]. In other words, if sn (x) denotes the nth partial
sum (11.105) of the Fourier series of the square-integrable function f (x) L 2 [ , ], then
lim k f sn k = 0.

(11.106)

An indication of the proof of this result will apppear below.


Remark : Theorem 11.37 is, in fact, a particular case of a theorem that governs eigenfunction expansions arising from quite general positive definite boundary value problems.
In order to understand this result, let us first describe some equivalent characterizations of completeness. The Plancherel formula is the infinite-dimensional counterpart of
our formula (5.6) for the norm of a vector in terms of its coordinates with respect to an
orthonormal basis.
Theorem 11.38. The orthonormal system of elements u1 , u2 , u3 , . . . V is complete
if and only if the Plancherel formula
k f k2 =

k =

| c k |2

where

ck = h f ; uk i,

(11.107)

holds for every f V .


Proof : We begin by computing the Hermitian norm
k f sn k2 = k f k2 h f ; sn i h sn ; f i + k sn k2 = k f k2 2 Re h f ; sn i + k sn k2 .

We are in essence repeating the proofs of Theorem 11.34 and the subsequent trigonometric
Bessel inequality (11.102) in an abstract setting.

3/7/03

472

c 2003

Peter J. Olver

Substituting the formula sn =

n
X

ck uk for the partial sums, we find, by orthonormality,

k=1

k sn k =

n
X

k=1

| ck | ,

while

h f ; sn i =

n
X

k=1

ck h f ; u k i =

n
X

k=1

| c k |2 .

Therefore,
2

0 k f sn k = k f k

n
X

k=1

| c k |2 .

(11.108)

The fact that the left hand side of (11.109) is non-negative for all n, implies the general
Bessel inequality,

X
| c k |2 ,
(11.109)
k f k2
k=1

which is valid for any orthonormal system of elements in an inner product space. As we
noted above, Bessels inequality implies that the generalized Fourier coefficients c k 0
must tend to zero reasonably rapidly in order that the sum of their squares converges.
Plancherels formula (11.107), thus, states that, if the system of functions is complete,
the Bessel inequality is, in fact, an equality! Indeed, letting n in (11.108), we have
lim k f sn k2 = k f k2

k=1

| c k |2 .

Therefore, the completeness condition (11.106) holds if and only if the right hand side
vanishes, which is the Plancherel identity (11.107).
Q.E.D.
Corollary 11.39. If ck , dk are the Fourier coefficients of f, g, respectively, with
respect to a complete orthonormal system, then they satisfy Parsevals identity
hf ;gi =

c k dk .

(11.110)

k =

Proof : If we write out the Plancherel formula (11.107) for the sum f + g of two
functions, we find
2

kf k + 2hf ;gi + kgk = kf + gk =


=

k =

k =

| c k + dk |2
2

| ck | + 2

k =

c k dk +

k =

| dk |2 .

Subtracting off the Plancherel equations for f and for g reduces this to Parsevals formula
(11.110).
Q.E.D.
3/7/03

473

c 2003

Peter J. Olver

In particular, in the case of the complex exponential basis of L2 [ , ], the Plancherel


and Parseval formulae tell us that
Z
Z

X
X
1
1
2
2
| f (x) | dx =
f (x) g(x) dx =
| ck | ,
ck dk , (11.111)
2
2
k =

k =

in which ck , dk are, respectively, the ordinary Fourier coefficients of the complex-valued


functions f (x) and g(x). Note that the Plancherel formula is a special case of the Parseval
identity (11.110) obtained by setting f = g. In Exercise , the reader is asked to rewrite
the two formulas in terms of the real Fourier coefficients.
Completeness also tells us that a function is uniquely determined by its Fourier coefficients.

Proposition 11.40. If the orthonormal system u1 , u2 , . . . V is complete, then the


only element f V with all zero Fourier coefficients: ck = 0 for all k = 1, 2, 3, . . ., is
the zero element, f = 0. More generally, two elements f, g V have the same Fourier
coefficients if and only if they are the same: f = g.
Proof : The proof is an immediate consequence of the Plancherel formula. Indeed, if
ck = 0, then (11.107) impies that k f k = 0. The second statement follows by applying the
first to the function f g.
Q.E.D.
Another way of stating this result is that the only function which is orthogonal to
every element of a complete orthonormal system is the zero function. Thus, a complete
orthonormal system is maximal in the senhse that no further orthonormal elements can
be appended to it.
Proof of Theorem 11.37 : We shall prove the completeness criterion only for continuous functions, leaving the harder general proof to the references, [132]. According
to Theorem 11.28, if f (x) is continuous, 2 periodic, and piecewise C 1 , its Fourier series
converges uniformly to f (x), so
f (x) =

ck e i k x

k =

for all

x .

We are allowed to multiply and integrate uniformly convergent series term by term, and
hence

X
X
ikx
2
ck e
=
ck f (x) e i k x .
| f (x) | = f (x) f (x) = f (x)
k =

k =

Integrating the result from to produces


Z
Z

X
X
X
1
1
2
ikx
2
| f (x) | dx =
c f (x) e
dx =
kf k =
ck ck =
| c k |2 .
2
2 k
k =

k =

k =

Therefore, Plancherels identity (11.107) holds for any continuous function. With some
additional technical work, this result is used to establish the validity of Plancherels formula
for all f L2 , the key step being to suitably approximate f by continuous functions. With
this in hand, completeness follows from Theorem 11.38.
Q.E.D.
3/7/03

474

c 2003

Peter J. Olver

Pointwise Convergence
Let us finally turn to the proof of the Pointwise Convergence Theorem 11.7. The goal
is to prove that, under the appropriate hypotheses, the limit of the partial Fourier sums is

(11.112)
lim sn (x) = 21 f (x+ ) + f (x ) .
n

We begin by substituting the formulae (11.52) for the complex Fourier coefficients into the
formula (11.98) for the nth partial sum:

Z
n
n
X
X
1
ikx
iky
sn (x) =
ck e
=
f (y) e
dy e i k x
2
k = n
k = n
Z
n
X
1
=
e i k(xy) dy.
f (y)
2
k = n

We can then use the geometric summation formula (11.60) to evaluate the result:

Z
sin n + 21 (x y)
1
dy
sn (x) =
f (y)
2
sin 21 (x y)

Z x+
Z
sin n + 12 y
sin n + 12 y
1
1
f (x + y)
f (x + y)
=
dy =
dy.
2 x
2
sin 21 y
sin 12 y

The next to last equality comes from changing variable in the integral from y to x + y.
The final equality comes from the fact that the integrand is 2 periodic, and so its integral
over any interval of length 2 has the same value; see Exercise .
Thus, to prove (11.112), it suffices to show that

Z
sin n + 21 y
1
dy = f (x+ ),
lim
f (x + y)
1
n
sin 2 y
0

(11.113)
Z 0
sin n + 21 y
1
lim
dy = f (x ).
f (x + y)
n
sin 21 y

The proofs of the two formulae are identical, and so we concentrate on the first. Equation (11.62) implies that

Z
Z X
n
sin n + 12 y
1
1
dy =
e i k y dy = 1.
1
0
0
sin 2 y
k = n

Multiplying the right hand side of the first equation in (11.113) by the integral allows us
to rewrite it in the form
Z

f (x + y) f (x+ )
1
sin n + 12 y dy = 0.
lim
(11.114)
1
n
sin 2 y
0
We claim that, for each fixed value of x, the function
g(y) =
3/7/03

f (x + y) f (x+ )
sin 12 y
475

c 2003

Peter J. Olver

is piecewise continuous for all 0 y . Owing to our hypothesis on f (x), the only
problematic point is when y = 0, but then
lim g(y) = lim

y 0+

y 0+

y
f (x + y) f (x+ )
= 2 f 0 (x+ )
y
sin 21 y

is twice the right hand derivative of f at x. The factor of 2 comes from the elementary
calculus limit
1
y
2 y
lim+
=
2
lim
= 2.
1
y0
y 0+ sin y
sin 12 y
2
Thus, formula (11.114) will follow if we can show that
Z

1
lim
g(y) sin n + 21 y dy = 0
n
0

(11.115)

for any piecewise continuous function g. Were it not for the extra 21 , this would immediately

follow from Lemma 11.35. More honestly, we use the addition formula for sin n + 21 y to
write
Z
Z
Z

1
1
1
1
1
g(y) sin 2 y cos n y dy +
g(y) cos 12 y sin n y dy
g(y) sin n + 2 y dy =
0
0
0
The first integral is the Fourier cosine coefficient e
an for the piecewise continuous function
1
g(y) sin 2 y, while the second integral is the Fourier sine coefficient ebn for the piecewise
continuous function g(y) cos 21 y. Lemma 11.35 implies that both of these converge to zero
as n , and hence (11.115) holds. This completes the proof.
Q.E.D.
Remark : An alternative approach to the last part of the proof is to use the following
general version RiemannLebesgue Lemma.
Lemma 11.41. Suppose g(x) is piecewise continuous on [ a, b ]. Then
Z b
g(x) e i x dx = 0.
lim

Intuitively, the lemma says that, as the frequency gets larger and larger, the increasingly rapid oscillations in sin x tend to cancel each other out. A formal proof of the
lemma from first principles can be found in [42, 132].
This concludes our brief foray into the theory underlying the Fourier series. Further
details can be found in a variety of references, including [22, 42, 72, 132].

This follows from lH


opitals rule, but in point of fact this is a fake application: one needs
to evaluate the limit when proving the formula for the derivative of the sine function, which is in
turn required for applying the lH
opital rule.

3/7/03

476

c 2003

Peter J. Olver

Chapter 13
Vibration and Diffusion
in OneDimensional Media
In this chapter, we study the solutions, both analytical and numerical, to the two
most important equations of one-dimensional continuum dynamics. The heat equation
describes the diffusion of thermal energy in a solid body; here we analyze the case of a
one-dimensional solid bar. The wave equation describes vibrations and waves in continuous
media, including sound waves, water waves, elastic waves, electromagnetic waves, and so
on. Again, we restrict our attention to the case of waves in a one-dimensional medium,
e.g., a string, or a bar, or a column of air. Multi-dimensional versions of these fundamental
equations will be treated in Chapters 16 and 17.
As we saw in Section 11.1, the basic solution technique is modeled on our methods
for solving linear systems of ordinary differential equations. Substituting an exponential
or trigonometric ansatz reduces the system to a self-adjoint boundary value problem. The
general solution to the partial differential equation can be expressed as a infinite series in
the eigenfunction solutions to the underlying boundary value problem. In the particular
cases considered here, the eigenfunctions are trigonometric, and thus the solution to the
partial differential equation is expressed in the form of a time-dependent Fourier series.
Although we cannot, in general, sum the infinite series to produce a closed form formula
for the solution, there are a number of useful observations that can be gleaned from this
representation.
In the case of the heat equation, the solutions decay exponentially fast to thermal
equilibrium, at a rate governed by the first eigenvalue of the associated boundary value
problem. The higher order Fourier modes damp out very rapidly, which makes the heat
equation a means of automatically smoothing and denoising signals and images. In the
case of the wave equation, each Fourier mode vibrates with a natural frequency, and the
full solution is a linear combination of these fundamental vibrations. For one-dimensional
media, the vibrational frequencies are integral multiples of a single lowest frequency, which
explains the musical qualties of strong and wind instruments. An alternative solution
technique for the one-dimensional wave equation, due to dAlembert, leads to an explicit
formula for the solution that points out the role of characteristics for signal propagation and
the behavior of solutions. Both the explicit and series solution methods are useful, and shed
complementary lights on the physical phenomena of vibration. In certain cases, a version
of the Greens function, known as the fundamental solution, can be used to construct
integral representations of the solution, including the solution to an inhomogeneous version
of the partial differential equation when subjected to external forcing. We will also show
3/7/03

531

c 2003

Peter J. Olver

how to exploit the symmetry properties of the differential equation in order to construct
new solutions from known solutions.
Finally, several basic numerical solution methods for both the heat and the wave
equation are presented and analyzed. We begin with a short discussion of basic finite
difference approximations arising in numerical differentiation. Substituting the numerical
differentiation formulae into the partial differential equation leads to a potential solution
scheme, albeit one that is not necessarily guaranteed to reproduce the analytic solution.
Further analysis is required to sort o;ut which schemes satisfy the convergence criteria
for producing bona fide numerical approximations to the solution. All of the numerical
solution methods reduce to some form of iterative linear system. The basic results from
Chapter 9 are brought to bear on understanding the convergence criteria and stability of
the different numerical techniques. We shall illustrate their implementation in some simple
examples.

13.1. The Diffusion and Heat Equations.


Let us begin with a physical derivation of the heat equation from first principles
of thermodynamics. Consider a bar meaning a thin, heat-conducting body of length
`. Thin means that we can regard the bar as a one-dimensional continuum with no
transverse temperature variation. We use 0 x ` to denote the position along the bar,
and u(t, x) to indicate the temperature of the bar at position x and time t.
At each point along the bar, the heat energy is proportional to the temperature,
and so
(t, x) = (x) u(t, x),
where
(x) = (x) (x)
(13.1)
is the product of the density of the material and its heat capacity . The total amount
of heat energy contained in the portion of the bar between x and x + x is obtained by
integration:
Z x+x
Z x+x
(t, y) dy =
(y) u(t, y) dy.
(13.2)
x

The dynamical equations governing heat flow are based on two fundamental physical
laws. The first law is that, in the absence of external sources of heat along the bar, heat
energy can only enter the bar through its ends. In other words, we assume that the bar
is fully insulated along its length. Let w(t, x) denote the heat flux , i.e., the rate of flow of
heat, at position x and time t. We use the convention that w(t, x) > 0 means that heat
energy is moving to the right, while w(t, x) < 0 if it moves to the left. Then the rate of
change in heat energy (13.2) on any section of the bar equals the total heat flux, namely

We are assuming the bar is not changing in time, and so physical quantities such as density
and heat capacity depend only on position x. We also assume, perhaps with less physical justification, that the material properites do not depend upon the temperature; otherwise, we would
be led to a much more difficult nonlinear diffusion equation.

3/7/03

532

c 2003

Peter J. Olver

the amount of the heat passing through the ends. Therefore, in view of our sign convention
on the flux,
Z x+x

(y) u(t, y) dy = w(t, x + x) + w(t, x),


t x
the two terms denoting the respective flux of heat into the section of the bar at its right
and left ends. Assuming sufficient regularity of the integrand, we are permitted to bring
the derivative inside the integral. Dividing both sides of this equation by x leads to
Z x+x

1
w(x + x) w(x)

(y) u(t, y) dy =
.
x x
t
x
In the limit as the length x 0, this yields the basic differential equation
( u)
w
=
t
x

(13.3)

relating temperature u and heat flux w. This type of equation is known as a conservation
law , and, in this particular case, expresses the law of conservation of heat energy in the
form of a differential equation. See Exercise for details.
The second physical law is a constitutive assumption that relates the heat flux to the
temperature. Physical experiments in a wide variety of materials indicate that the heat
energy moves from hot to cold at a rate that is in direct proportion to the rate of change
meaning the derivative of the temperature. The resulting linear constitutive relation
w(t, x) = (x)

u
x

(13.4)

relating heat flux and temperature is known as Fouriers Law of Cooling. The proportionality factor (x) > 0 is called the thermal conductivity of the bar at position x. A good
heat conductor, e.g., silver, will have high conductivity, while a poor conductor, e.g., glass,
will have low conductivity. The minus sign (13.4) tells us that heat energy moves from hot
u
(t, x) > 0 the temperature is increasing from left to right, and so the heat
to cold; if
x
energy moves back to the left, with flux w(t, x) < 0.
Combining (13.3) and (13.4) produces the basic partial differential equation

(x)
,
0 < x < `,
(13.5)
(x) u =
t
x
x

governing the diffusion of heat in a non-uniform bar. This particular second order partial
differential equation is known as the diffusion equation, and is used to model a variety of
diffusive processes, including heat flow, chemical diffusion, population dispersion, spread
of infectious diseases, and so on. If, in addition, there are external heat sources along the
bar, with h(t, x) representing heat introduced at position x and time t, then the diffusion
equation acquires an inhomogeneous term

(x)
+ h(t, x),
0 < x < `.
(13.6)
(x) u =
t
x
x
3/7/03

533

c 2003

Peter J. Olver

In order to uniquely specify the solution u(t, x) to the diffusion equation, we need to
specify the temperature distribution
u(0, x) = f (x),

0 x `,

along the bar at the initial time, which we take to be t = 0 without any significant loss
in generality. In addition, we are required to impose suitable boundary conditions at the
two ends of the bar. As with the equilibrium equations discussed in Chapter 10, there are
three common physical types. The first is a Dirichlet boundary condition, where an end of
the bar is held at prescribed temperature. Thus, the boundary condition
u(t, 0) = (t)

(13.7)

means that at any given time t, the temperature at the left hand end, x = 0, of the bar is
fixed by the value of (t). Alternatively, the Neumann boundary condition
u
(13.8)
(t, 0) = (t)
x
u
(t, 0) at the left hand end. In particular, the
prescribes the heat flux w(t, 0) = (0)
x
homogeneous Neumann condition with (t) 0 corresponds to an insulated end, where
no heat can flow in or out. Each end of the bar should have one or the other of these
boundary conditions. For example, a bar with both ends having prescribed temperatures
is governed by the pair of Dirichlet boundary conditions
u(t, 0) = (t),

u(t, `) = (t),

(13.9)

whereas a bar with two insulated ends requires two homogeneous Neumann boundary
conditions
u
u
(13.10)
(t, 0) = 0,
(t, `) = 0.
x
x
The mixed case, with one end fixed and the other insulated, is similarly prescribed. Finally,
the periodic boundary conditions
u
u
(13.11)
(t, 0) =
(t, `),
x
x
correspond to a circular ring of length `. As before, we are assuming the heat can only
flow along the ring the insulation prevents any radiation of heat from one side of the
ring to the other.
u(t, 0) = u(t, `),

The Heat Equation


In this book, we will retain the term heat equation to refer to the homogeneous
case, in which the bar is made of a uniform material, and so its density , conductivity
, and heat capacity are all positive constants. In this case, the homogeneous diffusion
equation (13.5) reduces to the heat equation
2u
u
=
t
x2
3/7/03

534

(13.12)
c 2003

Peter J. Olver

for the temperature u(t, x) in the bar. The constant


=

(13.13)

is called the thermal diffusivity of the bar, and incorporates all of its relevant physical properties. The solution u(t, x) will be uniquely prescribed once we specify initial conditions
u(0, x) = f (x) and a suitable pair of boundary conditions at the ends of the bar.
As we learned in Section 11.1, the most basic solutions to the heat equation are based
on the exponential ansatz
u(t, x) = e t v(x),
(13.14)
where v(x) is a time-independent function. Functions of this form, which separate into
a product of a function of t times a function of x, are known as separable solutions. This
solution ansatz is the simplest instance of the general method of separation of variables,
which is the most important technique for finding explicit solutions to linear partial differential equations. Further applications of this method will appear in this and the following
Chapters.
Substituting the separable solution (13.14) into (13.12) and canceling the common
exponential factors, we find that v(x) must solve the ordinary differential equation
v 00 = v.
In other words, v constitutes an eigenfunction, K[ v ] = v, with eigenvalue , for the second derivative operator K = D 2 . The actual eigenvalues and associated eigenfunctions
will be prescribed by the boundary conditions that v inherits from u. Once we determine
the eigenvalues and eigenfunctions, we will be able to reconstruct the solution u(t, x) as a
linear combination, or, rather, infinite series in the corresponding separable eigenfunction
solutions.
Let us consider the simplest case of a uniform bar held at zero temperature at each
end. The initial and boundary conditions are
u(t, 0) = 0,

u(t, `) = 0,

u(0, x) = f (x),

t 0,

(13.15)

0 < x < `.

According to the general prescription, we need to find the eigenvalues and eigenfunctions
for the boundary value problem

d2 v
+ v = 0,
dx2

v(0) = 0,

v(`) = 0.

(13.16)

Positive definiteness of the underlying differential operator K = D 2 tells us that we


need only look for positive eigenvalues: > 0. The sceptical reader may wish to check
explicitly that if 0 or is complex, then the boundary value problem (13.16) admits
only the trivial solution v(x) 0.
Setting = 2 with > 0, the general solution to the differential equation is a
trigonometric function
v(x) = a cos x + b sin x,
3/7/03

535

c 2003

Peter J. Olver

where a, b are arbitrary constants whose values are specified by the boundary conditions.
The boundary condition at x = 0 requires a = 0. The second boundary condition requires
v(`) = b sin ` = 0.
Hence ` must be an integer multiple of , and so

2
3
,
,
,
... .
`
`
`
Thus, the eigenvalues and eigenfunctions of the boundary value problem (13.16) are
n 2
n x
(13.17)
,
vn (x) = sin
,
n = 1, 2, 3, . . . .
n =
`
`
The corresponding separable solutions (13.14) to the heat equation with the given boundary
conditions are

n x
n2 2 t
(13.18)
sin
,
n = 1, 2, 3, . . . .
un (t, x) = exp
2
`
`
=

Each represents a trigonometrically oscillating temperature profile that maintains its form
while decaying at an exponential rate to zero. The first of these,

x
2 t
sin
u1 (t, x) = exp
,
2
`
`

experiences the slowest decay. The higher frequency modes un (t, x), n 2, all decay
faster, with those having a highly oscillatory temperature profile, where n 0, going to
zero almost instantaneously. Thus, small scale temperature fluctuations tend to rapidly
cancel each other out through diffusion of heat energy.
Linear superposition can be used to construct the general series solution

X
X
n x
n2 2 t
sin
(13.19)
u(t, x) =
bn un (t, x) =
bn exp
2
`
`
n=1
n=1

as a combination of the separable solutions. Assuming that the series converges, the initial
temperature profile is

X
n x
u(0, x) =
= f (x).
(13.20)
bn sin
`
n=1

This has the form of a Fourier sine series (11.39) on the interval [ 0, ` ] for the initial
temperature profile f (x). By orthogonality of the eigenfunctions which is a direct
consequence of the self-adjointness of the underlying boundary value problem (13.16)
the coefficients are determined by the inner product formulae (11.40), and so
Z
n x
2 `
(13.21)
f (x) sin
bn =
dx,
n = 1, 2, 3, . . . .
` 0
`

The resulting solution (13.19) describes the Fourier sine series for the temperature u(t, x)
of the bar at each later time t 0. It can be shown that, for quite general initial conditions,
the Fourier series does indeed converge to a solution to the initial-boundary value problem,
[122].
3/7/03

536

c 2003

Peter J. Olver

0.2

0.2

0.2

0.1

0.1

0.1

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

-0.1

-0.1

-0.1

-0.2

-0.2

-0.2

0.2

0.2

0.2

0.1

0.1

0.1

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

-0.1

-0.1

-0.1

-0.2

-0.2

-0.2

Figure 13.1.

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

A Solution to the Heat Equation.

Example 13.1. Consider the initial temperature profile

0 x 51 ,

x,
1
7
u(0, x) = f (x) =
x 52 ,
5 x 10 ,

7
1 x,
10 x 1,

(13.22)

on a bar of length 1, plotted in the first graph in Figure 13.1. Using (13.21), the first few
Fourier coefficients of f (x) are computed as
b1 = 0.0448 . . . ,
b5 = 0.0081 . . . ,

b2 = 0.096 . . . ,
b6 = 0.0066 . . . ,

b3 = 0.0145 . . . ,
b7 = 0.0052 . . . ,

b4 = 0,
b8 = 0,

... .

Setting = 1, the resulting Fourier series solution to the heat equation is


u(t, x) =

bn un (t, x) =

bn e n

2 t

sin n x

n=1

n=1

= 0.0448 e

2 t

sin x 0.096 e 4

sin 2 x 0.0145 e 9

sin 3 x .

In Figure 13.1, the solution is plotted at the successive times t = 0., .02, .04, . . . , .1. The
reader should note that the corners in the initial data are immedaitely smoothed out. As
time progresses, the solution decays at an exponential rate of 2 9.87 to a uniform,
zero temperature, which is the equilibrium temperature distribution for the homogeneous
Dirichlet boundary conditions. As the solution decays to thermal equilibrium, it also
assumes the progressively more symmetric shape of a single sine arc of exponentially decreasing amplitude, which is merely the profile of the first term in its Fourier series.
Smoothing and Long Time Behavior
The fact that we can write the solution to the initial-boundary value problem (13.12),
(13.15), for the heat equation in the form of an infinite series is progress of a sort. However,
because it cannot be summed in closed form, this solution is considerably less satisfying
3/7/03

537

c 2003

Peter J. Olver

than having a direct, explicit formula. Nevertheless, there are important properties of the
solution that can be gleaned from such series expansions.
If the initial data f (x) is piecewise continuous, then its Fourier coefficients are uniformly bounded:
Z
Z
2 `
n x
2 `
| bn |
| f (x) | dx M,
for all
n. (13.23)
f (x) sin
dx
` 0
`
` 0
Indeed, this property holds even for quite irregular data; for example, the Fourier coefficients (11.55) of the delta function are also uniformly bounded. Under these conditions,
each term in the series solution (13.19) is bounded by an exponentially decaying function

2 2
2 2

b exp n t sin n x M exp n t .


n
`2
`
`2

This means that, as soon as t > 0, most of the high frequency terms, n 0, will be
extremely small. Only the first few terms will be at all noticeable, and so the solution essentially degenerates into a finite sum over the first few Fourier modes. As time increases,
more and more of the Fourier modes will become negligible, and the sum further degenerates into progressively fewer significant terms. Eventually, as t , all of the Fourier
modes will decay to zero. Therefore, the solution will converge exponentially fast to a zero
temperature profile: u(t, x) 0 as t , representing the bar in its final uniform thermal
equilibrium. The fact that its equilibrium temperature is zero is a direct consequence of
the fact that we are holding both ends of the bar fixed at zero temperature any initial
heat in the bar will eventually be dissipated away. The last term to disappear is the one
with the slowest decay, namely

Z
2
x
1
u(t, x) b1 exp 2 t sin
f (x) sin x dx. (13.24)
,
where
b1 =
`
`
0
The solution approaches thermal equilibrium exponentially fast with rate equal to the
first (or, in exceptional cases , a higher) eigenvalue, 2 /`2 , which is proportional to the
thermal diffusivity divided by the square of the length of the bar. The longer the bar,
or the smaller the diffusivity, the longer it takes for the effect of holding the ends at zero
temperature to propagate along the entire bar. Also, again provided b 1 6= 0, the asymptotic
shape of the temperature profile is a small sine arc, just as we observed in Example 13.1.
The heat equations smoothing effect on irregular initial data by fast damping of the
high frequency modes underlies its effectiveness for smoothing out and denoising signals.
We take initial data u(0, x) = f (x) to be a noisy signal, and then evolve the heat equation
forward to a certain time ts tar > 0. The resulting function g(x) = u(ts tar, x) will be a
smoothed version of f (x) that eliminates most of the high frequency noise. Of course, if we
run the heat flow for too long, all of the low frequency features will be also be completely
smoothed out and the result will be a uniform, constant signal. Thus, the choice of stopping

More specifically, if b1 = 0, then the eigenvalue corresponding to the first nonzero term in
the series will govern the decay rate.

3/7/03

538

c 2003

Peter J. Olver

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

Figure 13.2.

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

Denoising a Signal Using the Heat Equation.

time ts tar is crucial to the success of this method. See Figure 13.2 for a sample, applied
to the same signal as in Figure 12.6. The heat equation has been run, with = 1, to times
t = 0., .00001, .00005, .0001, .001, .01. Notice how quickly the noise is removed. However,
by the final time, the overall smoothing effect of the heat flow has caused significant
degradation (blurring) of the original signal. The heat equation approach to denoising
has the advantage that no Fourier coefficients need be explicitly computed, nor does one
need to reconstruct the smoothed signal from its remaining Fourier coefficients. The final
section discusses numerical methods that can be used to solve the heat equation directly.
Another, closely related observation is that, for any fixed time t > 0 after the initial
moment, the coefficients in the Fourier series (13.19) decay exponentially fast as n .
According to the discussion at the end of Section 11.3, this implies that the solution
u(t, x) is a very smooth, infinitely differentiable function of x at each positive time t, no
matter how unsmooth the initial temperature profile. We have discovered the fundamental
smoothing property of heat flow.
Theorem 13.2. If u(t, x) is a solution to the heat equation with piecewise continuous
initial data f (x) = u(0, x), or, more generally, initial data satisfying (13.23), then, for any
t > 0, the solution u(t, x) is an infinitely differentiable function of x.
After even a very short amount of time, the heat equation smooths out most, and,
eventually, all of the fluctuations in the initial temperature profile. As a consequence, it
becomes impossible to reconstruct the initial temperature u(0, x) = f (x) by measuring the
temperature distribution h(x) = u(t, x) at a later time t > 0. Diffusion is irreversible we
cannot run the heat equation backwards in time! Indeed, if the initial data u(0, x) = f (x)
is not smooth, there is no function u(t, x) for t < 0 that could possibly yield such an
initial distribution because all corners and singularities are smoothed out by the diffusion
process as t goes forward! Or, to put it another way, the Fourier coefficients (13.21) of any
purported solution will be exponentially growing when t < 0, and so noise will completely
overwhelm the solution. For this reason, the backwards heat equation is said to be ill-posed .
On the other hand, the unsmoothing effect of the backwards heat equation also has
3/7/03

539

c 2003

Peter J. Olver

potential benefits. For example, in image processing, diffusion will gradually blur an image. Image enhancement is the reverse process, and so can be done by running the heat
flow backwards in some well-prescribed manner. For instance, one can restrict to the
first few Fourier modes, and then the corresponding backwards evolution is well-defined.
Similar problems occur in the reconstruction of subterranean profiles from seismic data, a
problem of considerable interest in the oil industry. A considerable amount of contemporary research activity is being devoted to cleverly circumventing the ill-posedness of the
backwards heat flow.
Remark : The irreversibility of the heat equation points out yet another important
difference between partial differential equations and ordinary differential equations. The
latter are always reversible although the detailed qualitative and quantitative properties
of solutions can very well depend upon whether time is running forwards or backwards.
Inhomogeneous Boundary Conditions
So far, we have concentrated our attention on homogeneous boundary conditions.
There is a simple trick that will convert a boundary value problem with inhomogeneous
but constant Dirichlet boundary conditions,
u(t, 0) = ,

u(t, `) = ,

t 0,

(13.25)

into a homogeneous Dirichlet problem. Consider the affine function


g(x) = +

x,
`

(13.26)

that interpolates the boundary data. The difference


u
e(t, x) = u(t, x) g(x) = u(t, x)

x
`

(13.27)

has homogeneous boundary conditions at both ends:

u
e(t, 0) = 0 = u
e(t, `).

Moreover, the affine function (13.26) is a stationary (meaning it does not depend upon
the time t) solution to the heat equation, and so, by linearity, the modified temperature
u
e(t, x) also solves the heat equation. The initial data is similarly modified

x.
u
e(0, x) = fe(x) = f (x) g(x) = f (x)
`

We can therefore write u


e(t, x) in Fourier series form (13.19), where the Fourier coefficients
are computed for the modified initial data fe(x). The solution to the inhomogeneous
boundary value problem has the series form

X
n x
n2 2

e
x +
t sin
,
(13.28)
bn exp
u(t, x) = +
2
`
`
`
n=1
3/7/03

540

c 2003

Peter J. Olver

where
eb = 2
n
`

`
0

n x
dx,
fe(x) sin
`

n = 1, 2, 3, . . . .

(13.29)

Since, for any reasonable initial data, u


e(t, 0) 0 will decay to zero at an exponential rate
as t , the actual temperature profile (13.28) will asymptotically decay to the affine
profile,

u(t, x) +
x
`
at the same exponentially fast rate.
If the boundary conditions are time-dependent, = (t), = (t), the substitution
(13.27) can be used to convert to homogeneous boundary conditions, but the time-varying
affine function
(t) (t)
x,
g(t, x) = (t) +
`
does not solve the heat equation. The resulting function u
e(t, x) = u(t, x) g(t, x) will
satisfy an inhomogeneous version of the heat equation
2u
e
e
u
=
h(t, x),
t
x2

where

h(t, x) =

g
(t, x)
t

represents a time-varying external heat source along the bar. Solution techniques in this
case will be discussed below.
The Heated Ring
Let us next consider the periodic boundary conditions (13.11), which govern heat
flow in an insulated circular ring. Let us fix the length of the ring to be ` = 2 , with
< x < representing angular coordinate around the ring. For simplicity, we also
choose units in which the thermal diffusivity is = 1. Thus, we seek to solve the heat
equation
2u
u
(13.30)
=
,
< x <
t > 0,
t
x2
subject to periodic boundary conditions
u
u
( , t) =
(, t),
x
x

u( , t) = u(, t),

t 0.

(13.31)

The initial temperature distribution is


u(0, x) = f (x),

< x < .

(13.32)

The resulting temperature u(t, x) will be a periodic function in x of period 2 .


Substituting the separable solution ansatz u(t, x) = e t v(x) into the heat equation
and boundary conditions leads to the periodic eigenvalue problem
d2 v
+ v = 0,
dx2
3/7/03

v( ) = v(),
541

v( ) = v().
c 2003

(13.33)
Peter J. Olver

The solutions are the trigonometric functions


vn (x) = cos n x,

ven (x) = sin n x,

n = 0, 1, 2, . . . ,

with corresponding eigenvalues n = n2 . The corresponding separable solutions to the


heated ring equation are
un (t, x) = e n

u
en (t, x) = e n

cos n x,

sin n x,

n = 0, 1, 2, 3, . . . .

The resulting infinite series solution is


u(t, x) =

1
2

a0 +

an e n t cos n x + bn e n

n=1

The initial conditions require


u(0, x) =

1
2

a0 +

n=1

sin n x .

an cos n x + bn sin n x = f (x),

(13.34)

(13.35)

which is precisely the Fourier series of the initial temperature profile f (x). Consequently,
Z
Z
1
1
an =
f (x) cos n x dx,
bn =
f (x) sin n x dx,
(13.36)


are the usual Fourier coefficients of f (x).
As in the Dirichlet case, for any positive time t > 0, the high frequency terms in the
2
series (13.34) are extremely small since e n t 1 for n 0 . Therefore, as soon as
t > 0, only the initial terms will be of any appreciable size, and the solution essentially
degenerates into a finite sum over the first few Fourier modes. Moreover, as t , all
of the Fourier modes except the constant one, with eigenvalue 0 = 0, will decay to zero.
Therefore, the solution will converge exponentially fast to a constant temperature profile:
Z
1
1
u(t, x)
a =
f (x) dx,
2 0
2
which equals the average of the initial temperature profile. Physically, we observe that the
heat energy is redistributed so that the ring achieves a uniform constant temperature and
is in thermal equilibrium. Indeed, the total heat energy
Z
u(t, x) dx = constant
(13.37)
E=

is conserved, meaning constant, for all time; a proof of this fact can be found in Exercise
.
Prior to equilibrium, only the lowest frequency Fourier modes will still be noticeable,
and so the solution will asymptotically look like
u(t, x)
3/7/03

1
2

a0 + e t (a1 cos x + b1 sin x) =


542

1
2

a0 + r1 e t cos(x + 1 ),
c 2003

(13.38)
Peter J. Olver

where
1
a1 = r1 cos 1 =
2

1
b1 = r1 sin 1 =
2

f (x) cos x dx,

f (x) sin x dx.

Thus, for most initial data, the solution approaches


p thermal equilibrium exponentially
fast, with rate 1. The exceptions are when r1 = a21 + b21 = 0, for which the rate of
convergence
is even faster specifically at e k t where k is the smallest integer such that
p
rk = a2k + b2k 6= 0.
The Fundamental Solution

The disadvantage of the Fourier series solution to the heat equation is that it is not
nearly as explicit as one might desire. An alternative, and quite useful approach is based
on the idea of the fundamental solution, which derives its inspiration from the Greens
function method for solving boundary value problems, and measures the effect of an initial
concentrated heat source.
Let us assume initially that we are dealing with homogeneous boundary conditions.
The idea is to first analyze the case when the initial data u(0, x) = y (x) = (x y) is a
delta function, which we can interpret as a highly concentrated unit heat source applied
at position y along the bar. The heat will diffuse away from its initial concentration, and
the resulting fundamental solution is denoted by
u(t, x) = F (t, x; y),

with

F (0, x; y) = (x y).

(13.39)

For each fixed y, the fundamental solution, as a function of t > 0 and x, must satisfy the
differential equation as well as the homogeneous boundary conditions.
Once we have found the fundamental solution, we can then use linear superposition
to reconstruct the general solution. Namely, we first write the initial data
Z `
u(0, x) = f (x) =
(x y) f (y) dy
(13.40)
0

as a superposition of delta functions, as in (10.35). Linearity implies that the solution is


then the same superposition of the responses to those concentrated delta profiles:
Z `
u(t, x) =
F (t, x; y) f (y) dy.
(13.41)
0

Assuming that we can differentiate under the integral sign, the fact that F (t, x; y) satisfies
the differential equation and the homogeneous boundary conditions for each fixed y immediately implies that the integral (13.41) is also a solution, and, moreover, has the correct
initial and (homogeneous) boundary conditions.
Unfortunately, most boundary value problems do not have fundamental solutions that
can be written down in closed form. An important exception is the case of an infinitely
long homogeneous bar, which requires solving the heat equation
u
2u
=
,
t
x2
3/7/03

< x < ,
543

(13.42)

t > 0.
c 2003

Peter J. Olver

For simplicity, we have chosen units in which the thermal diffusivity is = 1. The solution
u(t, x) is defined for all x R, and has initial conditions
u(0, x) = f (x)

for

< x < .

In order to specify the solution uniquely, we require that the temperature be squareintegrable at all t, so that
Z
| u(t, x) |2 dx <
for all
t 0.
(13.43)

These are the only boundary conditions that need be imposed in this situation.
On an infinite interval, the Fourier series solution to the heat equation becomes a
Fourier integral. We write the initial temperature distribution as a superposition
Z
f (x) =
fb(k) e2 i k x dk,

of complex exponentials e2 i k x , where fb(k) is the Fourier transform (12.69) of f (x). The
corresponding separable solutions to the heat equation are

2 2
2 2
(13.44)
u(t, x) = e 4 k t e2 i k x = e 4 k t cos 2 i k x + i sin 2 i k x ,

where the frequency variable k is allowed to assume any real value. We combine these
complex solutions into a Fourier integral

u(t, x) = \int_{-\infty}^{\infty} e^{-4 \pi^2 k^2 t}\, e^{2 \pi i k x}\, \hat{f}(k) \, dk    (13.45)

to form the solution to the initial value problem for the heat equation.
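For readers who want to experiment with (13.45) numerically, the following minimal Python sketch approximates the solution by sampling the initial data on a large periodic grid and damping each discrete Fourier mode by the factor e^{-4 \pi^2 k^2 t}. The domain half-width, grid size, and Gaussian initial profile are arbitrary illustrative choices, and the periodic grid is only an approximation to the full line.

```python
import numpy as np

def heat_solution_fft(f_vals, L, t):
    """Approximate u(t, x) from (13.45): damp each Fourier mode of the
    initial data by exp(-4 pi^2 k^2 t).  The grid is treated as periodic
    on [-L, L], a good approximation when the data decays rapidly."""
    n = f_vals.size
    f_hat = np.fft.fft(f_vals)
    # frequencies k in cycles per unit length, matching e^{2 pi i k x}
    k = np.fft.fftfreq(n, d=2 * L / n)
    u_hat = np.exp(-4 * np.pi**2 * k**2 * t) * f_hat
    return np.real(np.fft.ifft(u_hat))

# example: Gaussian initial temperature on [-10, 10]
L, n = 10.0, 1024
x = np.linspace(-L, L, n, endpoint=False)
f = np.exp(-x**2)
u1 = heat_solution_fft(f, L, t=1.0)   # smoothed profile at t = 1
```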
In particular, to recover the fundamental solution, we take the initial temperature
profile to be a delta function \delta_y(x) = \delta(x - y) concentrated at x = y. According to
(12.97), its Fourier transform is

\hat{\delta}_y(k) = e^{-2 \pi i k y}.

Plugging this into (13.45), and then referring to our table of Fourier transforms, we find
the explicit formula for the fundamental solution

F(t, x; y) = \int_{-\infty}^{\infty} e^{-4 \pi^2 k^2 t}\, e^{2 \pi i k (x - y)} \, dk = \frac{1}{2 \sqrt{\pi t}}\; e^{-(x - y)^2 / 4 t}.    (13.46)

The reader may wish to verify that the final formula is indeed a solution to the heat
equation for all t > 0. Note that

\lim_{t \to 0^+} F(t, x; y) = \begin{cases} 0, & x \neq y, \\ \infty, & x = y. \end{cases}

Interestingly, the individual Fourier solutions (13.44) are not square integrable. Nevertheless,
when they are combined together in (13.45), the resulting solution does satisfy (13.43). The
analytical details can be found in an advanced text on Fourier analysis, [118].
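The suggested verification that (13.46) indeed solves the heat equation can be carried out by hand, or automated with a computer algebra system; the following small sketch (using sympy, one possible choice) checks that F_t - F_xx simplifies to zero.

```python
import sympy as sp

t = sp.symbols('t', positive=True)
x, y = sp.symbols('x y', real=True)
F = sp.exp(-(x - y)**2 / (4 * t)) / (2 * sp.sqrt(sp.pi * t))

# the heat operator applied to the fundamental solution should vanish
residual = sp.diff(F, t) - sp.diff(F, x, 2)
print(sp.simplify(residual))   # prints 0
```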


Figure 13.3. The Fundamental Solution to the Heat Equation.

Furthermore, the integral

E = \int_{-\infty}^{\infty} F(t, x; y) \, dx = 1    (13.47)

of the fundamental solution represents the total heat energy, which, as in (13.37), is
constant. We conclude that, at the initial time, the fundamental solution
F(0, x; y) = \delta_y(x)
satisfies the original limiting definition of the delta function. As graphed in Figure 13.3
at times t = .05, .1, 1., 10., F(t, x; y) represents a function that starts out as a delta spike
at x = y and then immediately smoothes out into a tall and narrow bell-shaped curve
centered at x = y. As time increases, the solution shrinks and widens, decaying everywhere
to zero, with amplitude proportional to t^{-1/2} and width proportional to t^{1/2}. The total
heat energy (13.47) remains fixed while it gradually spreads out along the entire real line.
Remark: In probability, these bell-shaped curves are known as normal or Gaussian
distributions. The width of the bell curve corresponds to the standard deviation. For this
reason, the fundamental solution to the heat equation is sometimes referred to as a Gaussian filter.
Remark: One of the non-physical artifacts of the heat equation is that it permits
heat energy to propagate with infinite speed. Indeed, the effect of an initially localized
concentration of heat energy will immediately be felt along the entire length of an infinite
bar, because, at any t > 0, the fundamental solution is nonzero for all x. (The graphs
in Figure 13.3 are a little misleading because they fail to show the extremely small, but
still positive, exponentially decreasing tails of the solution.) This effect, while more or less
negligible at large distances, is nevertheless in clear violation of physical intuition, not
to mention relativity, which postulates that signals cannot propagate faster than the speed
of light. Despite this non-physical effect, the heat equation remains an extremely accurate
and effective model for basic heat flow and similar diffusive phenomena.

With the fundamental solution in hand, we can then adapt the linear superposition
formula (13.41) to reconstruct the general solution
u(t, x) = \frac{1}{2 \sqrt{\pi t}} \int_{-\infty}^{\infty} e^{-(x - y)^2 / 4 t}\, f(y) \, dy    (13.48)

to our initial value problem (13.42). In other words, the solutions are obtained by convolution, (12.111),
u(t, x) = F(t, x) * f(x),
of the initial data with a one-parameter family of progressively wider and shorter Gaussian
filters (13.46). Thus, the Gaussian filter convolution has a smoothing effect on the signal
f (x). Indeed, the convolution integral (13.48) serves to replace each initial value f (x) by a
weighted average of nearby values, the weight being determined by the exponential Gaussian distribution. Thus, a (weighted) averaging operation has the effect of smoothing out
high frequency variations in the signal. Consequently, the Gaussian convolution formula
(13.48) provides an effective method of signal and image denoising. In fact, for practical
reasons, the graphs displayed earlier in Figure 13.2 were computed by using a standard
numerical integration routine on the convolution (13.48) rather than a numerical solution
scheme for the heat equation.
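The same numerical strategy is easy to reproduce; the sketch below evaluates the convolution (13.48) by the trapezoidal rule. The step-function initial data, grid spacing, and evaluation time are arbitrary choices made for illustration.

```python
import numpy as np

def heat_by_convolution(f, y, x, t):
    """Evaluate (13.48): convolve the sampled initial data f(y) with the
    Gaussian kernel of width sqrt(2 t), by trapezoidal quadrature."""
    kernel = np.exp(-(x[:, None] - y[None, :])**2 / (4 * t)) / (2 * np.sqrt(np.pi * t))
    return np.trapz(kernel * f[None, :], y, axis=1)

y = np.linspace(-20, 20, 4001)
f = np.where(np.abs(y) < 5, 1.0, 0.0)      # unit temperature on (-5, 5)
x = np.linspace(-10, 10, 401)
u = heat_by_convolution(f, y, x, t=1.0)    # smoothed profile at t = 1
```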
Example 13.3. An infinite bar is initially heated to unit temperature along a finite
interval. This corresponds to an initial temperature profile
u(0, x) = f(x) = \sigma(x - a) - \sigma(x - b) = \begin{cases} 1, & a < x < b, \\ 0, & \text{otherwise.} \end{cases}

The corresponding solution to the heat equation is obtained by the integral formula (13.48),
producing
u(t, x) = \frac{1}{2 \sqrt{\pi t}} \int_a^b e^{-(x - y)^2 / 4 t} \, dy = \frac{1}{2} \left[ \operatorname{erf}\Bigl( \frac{x - a}{2 \sqrt{t}} \Bigr) - \operatorname{erf}\Bigl( \frac{x - b}{2 \sqrt{t}} \Bigr) \right],    (13.49)

where

\operatorname{erf} x = \frac{2}{\sqrt{\pi}} \int_0^x e^{-z^2} \, dz    (13.50)

is known as the error function due to its applications in probability and statistics. A
graph appears in Figure 13.4. The error function integral cannot be written in terms of
elementary functions. Nevertheless, its importance in various applications means that its
properties have been well studied, and its values tabulated, [39].
A graph of the solution (13.49) when a = -5, b = 5, at successive times t =
0., .1, 1, 5, 30, 300, is displayed in Figure 13.5. Note the initial smoothing or blurring of
the sharp interface, followed by a gradual decay of the temperature to thermal equilibrium.
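Since the error function is available in standard numerical libraries, the solution (13.49) can be evaluated directly; the following sketch uses scipy.special.erf with the same interval endpoints a = -5, b = 5 as in the figure.

```python
import numpy as np
from scipy.special import erf

def box_initial_temperature(x, t, a=-5.0, b=5.0):
    """Exact solution (13.49) for an initial unit temperature on (a, b)."""
    return 0.5 * (erf((x - a) / (2 * np.sqrt(t))) - erf((x - b) / (2 * np.sqrt(t))))

x = np.linspace(-10, 10, 401)
profiles = {t: box_initial_temperature(x, t) for t in (0.1, 1.0, 5.0, 30.0)}
# the profiles blur the sharp interface and decay, as in Figure 13.5
```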

Figure 13.4. The Error Function.

Figure 13.5. Error Function Solution to the Heat Equation.

13.2. Similarity and Symmetry Methods.


So far, save for an interlude devoted to complex analysis methods for solving the
Laplace equation, we have relied almost exclusively upon the method of separation of
variables to construct explicit solutions to linear partial differential equations. Beyond
this, the most useful solution technique is through exploitation of some inherent symmetry
property of the differential equation. Unlike separation of variables methods, symmetry
methods can also be applied to produce solutions to nonlinear partial differential equations;
see Chapter 21 for some simple examples. While we do not have the space or mathematical
sophistication to develop the full apparatus of symmetry techniques, we shall introduce
the important concept of a similarity solution, and use it to make further progress on our
understanding of the heat equation.
Remark : The general symmetry method is founded on the theory of Lie groups, named
after the influential nineteenth century Norwegian mathematician Sophus Lie (pronounced
Lee). The application of Lie's symmetry methods to partial differential equations arising
in physics and engineering can be traced back to the influential book of G. Birkhoff, [14],
on hydrodynamics. A complete and comprehensive treatment of symmetry methods can
3/7/03

547

c 2003

Peter J. Olver

be found in the first author's book [97], and, at a more introductory level, in the books
by Hydon, [68], and Seshadri and Na, [110].
In general, by a symmetry of an equation, we mean a transformation that takes solutions to solutions. Thus, if we know a symmetry, and one solution, then we can construct a
second solution by applying the symmetry. And, possibly, a third solution by applying the
symmetry yet again. And so on. If we know lots of symmetries, then we can produce lots
and lots of solutions by this simple device. Actually, we have already made extensive use
of symmetries in the conformal mapping approach to the Laplace equation. Any conformal
mapping will map a harmonic function, or solution to the Laplace equation, to another
solution. This seminal observation underlies the power of the conformal mapping method
for solving boundary value problems.
Unfortunately, most differential equations do not have the richness in symmetry enjoyed by the planar Laplace equation. (Even its three-dimensional counterpart has only a
limited collection of conformal symmetries, [97].) Lie's theory provides an algorithm for
completely determining all the symmetries of a given differential equation, but this will
not be possible to describe in this introductory text. However, by relying on inspection or
physical intuition, we can often detect the most important symmetries without appealing
to such a sophisticated theory.
The heat equation ut = uxx serves as an excellent testing ground for the general
symmetry methodology, as it admits a rich variety of symmetry transformations that take
solutions to solutions. The simplest are the translational symmetries. Translating the
space and time coordinates by a fixed amount,
t \mapsto t - a,        x \mapsto x - b,    (13.51)

where a, b are arbitrary constants, changes the function u(t, x) into the translated function

U(t, x) = u(t - a,\, x - b).    (13.52)

A simple application of the chain rule proves that the partial derivatives of U with respect
to t and x agree with the corresponding partial derivatives of u, so U_t = u_t, U_x = u_x, U_{xx} =
u_{xx}, and so on. In particular, the function U(t, x) is a solution to the heat equation
U_t = U_{xx} whenever u(t, x) solves u_t = u_{xx}. Physically, the translational symmetry
formalizes the property that the heat equation models a homogeneous medium, and hence
the solution does not depend on the choice of origin or reference point in our coordinate
system.
As a consequence, once we know one solution to the heat equation, we can apply an
arbitrary translation (13.52), and thereby produce an infinite family of translated solutions.
For example, starting with
u(t, x) = e^{-t} \sin x,
which is a special case of our separable solutions (13.18), we immediately produce the
translated solutions
u(t, x) = e^{-(t - a)} \sin(x - b),
for any choice of constants a, b.

Typically, symmetries of a differential equation do not respect initial or boundary
conditions, and so are of most use when the problem is posed on the entire space. For
instance, if u(t, x) is defined for t > 0 and in the domain 0 \leq x \leq \ell, then its translated
version U(t, x) is defined for t > a and in the translated domain b \leq x \leq \ell + b, and so will
solve an appropriately modified initial-boundary value problem.
A second, even more important class of symmetries are the scaling invariances. We
already know that if u(t, x) is a solution, so is any scalar multiple c u(t, x); this is a
simple consequence of linearity of the heat equation. We can also add any constant to the
temperature, noting that
U (t, x) = c u(t, x) + k
(13.53)
is a solution for any choice of constants c, k. Physically, the transformation (13.53) amounts
to a change in the scale for measuring temperature. For instance, if u is measured in degrees
Celsius, and we set c = 9/5 and k = 32, then U = (9/5) u + 32 will be measured in degrees
Fahrenheit. Thus, reassuringly, the physical processes described by the heat equation do
not depend upon our choice of thermometer.
More interestingly, suppose we rescale the space and time variables:
t \mapsto \alpha t,        x \mapsto \beta x,    (13.54)

where \alpha, \beta > 0 are positive constants. The effect of such a scaling transformation is to
change the function u(t, x) into a rescaled function

U(t, x) = u(\alpha t,\, \beta x).    (13.55)

The derivatives of U are related to those of u according to the following formulae, which
are simple consequences of the multi-variable chain rule:

\frac{\partial U}{\partial t} = \alpha\, \frac{\partial u}{\partial t},        \frac{\partial U}{\partial x} = \beta\, \frac{\partial u}{\partial x},        \frac{\partial^2 U}{\partial x^2} = \beta^2\, \frac{\partial^2 u}{\partial x^2}.

Therefore, if u satisfies the heat equation u_t = \gamma\, u_{xx}, then U satisfies the rescaled heat
equation

U_t = \alpha\, u_t = \alpha \gamma\, u_{xx} = \frac{\alpha \gamma}{\beta^2}\, U_{xx},

which we rewrite in the form

U_t = \tilde{\gamma}\, U_{xx}.    (13.56)

The diffusion coefficient for the rescaled heat equation is

\tilde{\gamma} = \frac{\alpha \gamma}{\beta^2}.    (13.57)

Thus, the net effect of scaling space and time is merely to rescale the diffusion coefficient
in the heat equation.
Remark : Physically, the scaling symmetry (13.54) corresponds to a change in the
physical units used to measure time and distance. For instance, to change from seconds to
minutes, set = 60, and from meters to yards, set = 1.0936. The net effect (13.57) on
the diffusion coefficient is a reflection of its physical units, namely distance 2 /time.
3/7/03

549

c 2003

Peter J. Olver

In particular, if we choose

\alpha = \frac{1}{\gamma},        \beta = 1,

then the rescaled diffusion coefficient is \tilde{\gamma} = 1. This observation has the following important
consequence. If U(t, x) solves the heat equation for a unit diffusivity, \gamma = 1, then

u(t, x) = U(\gamma t, x)    (13.58)

solves the heat equation for a non-unit value of the diffusivity \gamma. Thus, the only effect
of the diffusion coefficient is to speed up or slow down the time variable! A body with
diffusivity \gamma = 2 will cool down twice as fast as a body (of the same shape, subject to the
same boundary conditions and initial conditions) with diffusivity \gamma = 1. Note that this
particular rescaling has not altered the space coordinates, and so U(t, x) is defined on the
same domain as u(t, x).
On the other hand, if we set \alpha = \beta^2, then the rescaled diffusion coefficient is exactly
the same as the original: \tilde{\gamma} = \gamma. Thus, the particular rescaling

t \mapsto \beta^2 t,        x \mapsto \beta x,    (13.59)

does not alter the equation, and hence defines a symmetry, known as a similarity transformation, for the heat equation. Combining (13.59) with the linear rescaling u \mapsto c\, u, we
make the elementary, but important observation that if u(t, x) is any solution to the heat
equation u_t = \gamma\, u_{xx}, then so is the function

U(t, x) = c\, u(\beta^2 t,\, \beta x)    (13.60)

for any choice of (nonzero) scale parameters c, \beta. Rescaling by non-unit \beta \neq 1 will alter
the domain of definition of the solution. If u(t, x) is defined for 0 \leq x \leq \ell, then U(t, x)
is defined for 0 \leq x \leq \ell/\beta.
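The claim that (13.60) maps solutions to solutions is easy to confirm symbolically. The sketch below applies the rescaling to the separable solution e^{-t} sin x used earlier and checks that the heat equation is still satisfied; the choice of test solution is, of course, arbitrary.

```python
import sympy as sp

t, x, beta, c = sp.symbols('t x beta c', real=True)

u = sp.exp(-t) * sp.sin(x)                        # a known solution of u_t = u_xx
U = c * u.subs({t: beta**2 * t, x: beta * x})     # rescaled as in (13.60)

# the rescaled function satisfies the same heat equation
print(sp.simplify(sp.diff(U, t) - sp.diff(U, x, 2)))   # prints 0
```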
In particular, suppose that we have solved the heat equation for the temperature u(t, x)
on a bar of length 1, subject to certain initial and boundary conditions. We are then given
a bar composed of the same material of length 2. Thus, the diffusivity coefficient has not
changed, and we construct the corresponding solution U(t, x) by rescaling. A scaling factor
\beta = \frac{1}{2} will serve to double the length. If we also rescale time by a factor \alpha = \beta^2 = \frac{1}{4}, then
the rescaled function U(t, x) = u\bigl( \frac{1}{4} t, \frac{1}{2} x \bigr) will be a solution of the heat equation on the
longer bar with the same diffusivity constant. The net effect is that the rescaled solution
will be evolving four times as slowly as the original solution u(t, x). Thus, it effectively
takes a bar that is double the size four times as long to cool down.
The Inhomogeneous Heat Equation
The fundamental solution can also be used to solve the inhomogeneous problem

u_t = u_{xx} + h(t, x),    (13.61)

where the bar is subject to an external heat source h(t, x) that may depend upon both
the position along the bar and time. We begin by solving the particular problem

u_t = u_{xx} + \delta(t - s)\, \delta(x - y).    (13.62)

Here, the forcing term represents a unit heat source that is concentrated at a position
0 < y < ` and applied instantaneously at a single time t = s > 0. We impose the
homogeneous initial conditions
u(0, x) = 0
(13.63)
as well as homogeneous boundary conditions of one of our specified types. The solution to
this problem will be denoted as
u(t, x) = G(t, x; s, y)

(13.64)

and is referred to as the general fundamental solution to the heat equation. Since a heat
source which is applied at time s will only affect the solution at later times t \geq s, we have

G(t, x; s, y) = 0    for all    t < s.    (13.65)

Once we know the general fundamental solution (13.64), we are able to solve the
problem for a general external heat source (13.61) by appealing to linearity. We first write
the forcing as a superposition
h(t, x) = \int_0^{\infty} \!\! \int_0^{\ell} h(s, y)\, \delta(t - s)\, \delta(x - y) \, dy \, ds    (13.66)

of concentrated instantaneous heat sources. Linearity allows us to conclude that the solution is given by the superposition formula

u(t, x) = \int_0^{t} \!\! \int_0^{\ell} h(s, y)\, G(t, x; s, y) \, dy \, ds.    (13.67)

The fact that we only need to integrate over times 0 \leq s \leq t follows from (13.65).

Remark : If we have a nonzero initial condition, u(0, x) = f (x), then we use linear
superposition to write the solution
u(t, x) = \int_0^{\ell} F(t, x; y)\, f(y) \, dy + \int_0^{t} \!\! \int_0^{\ell} h(s, y)\, G(t, x; s, y) \, dy \, ds    (13.68)

as a combination of
(a) the solution with no external heat source, but inhomogeneous initial conditions, plus
(b) the solution with homogeneous initial conditions but nonzero heat source.

Let us solve the forced heat equation in the case of an infinite bar, so -\infty < x < \infty.
We begin by computing the general fundamental solution to (13.62). Taking the Fourier
transform of both sides of the partial differential equation with respect to x and using
(12.97), (12.101), we find

\frac{\partial \hat{u}}{\partial t} + 4 \pi^2 k^2\, \hat{u} = e^{-2 \pi i k y}\, \delta(t - s).    (13.69)

This is an inhomogeneous first order ordinary differential equation for the Fourier transform
\hat{u}(t, k) of u(t, x). Assuming s > 0, by (13.65), the initial condition is

\hat{u}(0, k) = 0.    (13.70)

We solve the initial value problem by the usual method, [20]. Multiplying (13.69) by the
integrating factor e^{4 \pi^2 k^2 t} yields

\frac{\partial}{\partial t} \Bigl( e^{4 \pi^2 k^2 t}\, \hat{u} \Bigr) = e^{4 \pi^2 k^2 t - 2 \pi i k y}\, \delta(t - s),

and so

\hat{u}(t, k) = \sigma(t - s)\, e^{4 \pi^2 k^2 (s - t) - 2 \pi i k y},

where \sigma(t) is the usual step function (10.40). Using the inverse Fourier transform formula (12.71) and then (13.46), we deduce that

u(t, x) = G(t, x; s, y) = \sigma(t - s) \int_{-\infty}^{\infty} e^{4 \pi^2 k^2 (s - t) + 2 \pi i k (x - y)} \, dk
        = \frac{\sigma(t - s)}{2 \sqrt{\pi (t - s)}} \exp\left( - \frac{(x - y)^2}{4 (t - s)} \right) = \sigma(t - s)\, F(t - s,\, x - y).

Thus, the general fundamental solution is obtained by translating the fundamental solution
F(t, x; y) = F(t, x - y) for the initial value problem to a starting time of t = s instead of
t = 0. The effect of an initial concentrated temperature is exactly the same as that of a
concentrated heat source applied at the initial time; see [77; p. 15] for further details. The
superposition principle (13.67) produces the solution

u(t, x) = \int_0^{t} \!\! \int_{-\infty}^{\infty} \frac{h(s, y)}{2 \sqrt{\pi (t - s)}} \exp\left( - \frac{(x - y)^2}{4 (t - s)} \right) dy \, ds    (13.71)

to the heat equation with source term on an infinite bar.
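In practice, the double integral (13.71) is usually evaluated numerically. The following sketch does so with scipy's adaptive quadrature for a sample steady Gaussian heat source; the source profile, evaluation point, and the finite window used for the y-integration are illustrative assumptions, and the integrand has an integrable square-root singularity as s approaches t.

```python
import numpy as np
from scipy.integrate import dblquad

def forced_heat_solution(x, t, h):
    """Evaluate the superposition formula (13.71) on an infinite bar.
    h(s, y) is the heat source; the y-integration is truncated to a
    window wide enough to contain the Gaussian kernel."""
    def integrand(y, s):
        # kernel from (13.71); integrable 1/sqrt(t - s) singularity at s = t
        return h(s, y) * np.exp(-(x - y)**2 / (4 * (t - s))) \
               / (2 * np.sqrt(np.pi * (t - s)))
    val, _ = dblquad(integrand, 0, t, lambda s: x - 20, lambda s: x + 20)
    return val

h = lambda s, y: np.exp(-y**2)          # illustrative steady Gaussian heat source
u = forced_heat_solution(0.0, 1.0, h)   # temperature at x = 0, t = 1
```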


The Root Cellar Problem

As a final example, we discuss a problem that leads to analysis of the heat equation on
a semi-infinite interval. How deep should you dig a root cellar? A root cellar should keep
food cool in the summer, but not let it freeze in the winter. We assume the temperature
only depends on the depth and the time of year, and let u(t, x) denote the deviation of the
temperature in the earth at depth x > 0 and time t from its mean value. We shall assume
that the temperature at the surface x = 0 fluctuates in a periodic manner; specifically, we
set

u(t, 0) = a \cos \omega t.    (13.72)
At large depth the temperature is assumed to be nonvarying:

u(t, x) \longrightarrow 0    as    x \to \infty,    (13.73)

where 0 refers to the mean temperature. The frequency

\omega = \frac{2 \pi}{365.25 \text{ days}} \approx 2.0 \times 10^{-7} \text{ sec}^{-1}    (13.74)

refers to the yearly temperature variation. We shall ignore daily temperature fluctuations
in this model as their effect is not significant below a thin surface layer of the earth.

Thus, we must solve the heat equation on a semi-infinite bar 0 < x < \infty, with time-dependent boundary conditions (13.72), (13.73) at the ends. The analysis will be simplified
a little if we replace the cosine by a complex exponential, and so look for a complex solution
with boundary conditions

u(t, 0) = a\, e^{i \omega t},        \lim_{x \to \infty} u(t, x) = 0.

Let us try a separable solution of the form

u(t, x) = v(x)\, e^{i \omega t}.    (13.75)

Substituting this expression into the heat equation u_t = \gamma\, u_{xx} leads to

i \omega\, v(x)\, e^{i \omega t} = \gamma\, v''(x)\, e^{i \omega t}.

We conclude that v(x) should solve the boundary value problem

\gamma\, v''(x) = i \omega\, v,        v(0) = a,        \lim_{x \to \infty} v(x) = 0.

The solutions to the ordinary differential equation are

v_1(x) = e^{\sqrt{i \omega/\gamma}\; x} = e^{\sqrt{\omega/2\gamma}\,(1 + i)\, x}    and    v_2(x) = e^{-\sqrt{i \omega/\gamma}\; x} = e^{-\sqrt{\omega/2\gamma}\,(1 + i)\, x}.

The first solution is exponentially growing as x \to \infty, and so not appropriate to our
problem. The solution to the boundary value problem must therefore be a multiple,

v(x) = a\, e^{-\sqrt{\omega/2\gamma}\,(1 + i)\, x},

of the exponentially decaying solution. Substituting back into (13.75), we find the (complex) solution to the root cellar problem to be

u(t, x) = a\, e^{-x \sqrt{\omega/2\gamma}}\; e^{\,i\,(\omega t - x \sqrt{\omega/2\gamma})}.    (13.76)

The corresponding real solution is obtained by taking the real part,

u(t, x) = a\, e^{-x \sqrt{\omega/2\gamma}} \cos\Bigl( \omega t - x \sqrt{\tfrac{\omega}{2\gamma}} \Bigr).    (13.77)

The first term in (13.77) is exponentially decaying as a function of the depth. Thus, the
further down one goes, the less noticeable the surface temperature fluctuations are. The second
term is periodic with the same annual frequency \omega. The interesting feature is the phase
lag in the response. The temperature at a given depth x is out of phase with respect to
the surface temperature fluctuations, with the phase lag

\delta = x \sqrt{\frac{\omega}{2 \gamma}}

depending linearly on depth. In particular, a cellar built at a depth where \delta is an odd
multiple of \pi will be completely out of phase, being hottest in the winter, and coldest in

the summer. Thus, the (shallowest) ideal depth at which to build a root cellar would take
\delta = \pi, corresponding to a depth of

x = \pi \sqrt{\frac{2 \gamma}{\omega}}.

For typical soils in the earth, \gamma \approx 10^{-6} meters^2 sec^{-1}, [X], and hence, by (13.74),
x \approx 9.9 meters. However, at this depth, the relative amplitude of the oscillations is

e^{-x \sqrt{\omega/2\gamma}} = e^{-\pi} \approx .04,

and hence there is only a relative 4% temperature fluctuation. In Minnesota, the temperature varies, roughly, from -40° C to +40° C, and hence our root cellar would experience
only a 3.2° C annual temperature deviation from the winter, when it is the warmest, to
the summer, when it is the coldest. Building the cellar twice as deep would lead to a
temperature fluctuation of .2%, in phase with the surface variations, which means that the
cellar is, for all practical purposes, at constant temperature year round.
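The numbers quoted above follow directly from (13.74) and (13.77); a few lines of Python reproduce them, using the typical soil diffusivity of 10^{-6} m^2/sec cited in the text.

```python
import numpy as np

gamma = 1.0e-6                                  # soil diffusivity, m^2 / sec
omega = 2 * np.pi / (365.25 * 24 * 3600)        # yearly frequency, as in (13.74)

depth = np.pi * np.sqrt(2 * gamma / omega)      # shallowest fully out-of-phase depth
attenuation = np.exp(-depth * np.sqrt(omega / (2 * gamma)))  # relative amplitude

print(depth)         # roughly 9.9 meters
print(attenuation)   # e^{-pi}, about 0.04
```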

13.3. The Wave Equation.


The second important class of dynamical partial differential equations are those modeling vibrations of continuous media. As we saw in Chapter 8, Newton's Law implies that
the free vibrations of a discrete mechanical system are governed by a second order system
of ordinary differential equations of the form

M\, \frac{d^2 u}{dt^2} = - K\, u,

in which M is the positive definite, diagonal mass matrix, while K = A^* \circ A = A^T C A is the
positive definite (or semi-definite in the case of an unstable system) stiffness matrix.
The corresponding dynamical equations describing the vibrations of continuous media
take an entirely analogous form

\rho\, \frac{\partial^2 u}{\partial t^2} = - K[\, u\, ].    (13.78)

In this case, \rho denotes the density of the medium, while K = L^* \circ L is the same self-adjoint
differential operator, with appropriate boundary conditions, that appears in the
equilibrium equations. For one-dimensional media, such as a vibrating bar or string,
Newton's Law leads to a partial differential equation of the particular form

\rho(x)\, \frac{\partial^2 u}{\partial t^2} = \frac{\partial}{\partial x} \Bigl( \kappa(x)\, \frac{\partial u}{\partial x} \Bigr),        0 < x < \ell,    (13.79)

where \rho(x) is the density of the bar or string at position x, while \kappa(x) > 0 denotes its
stiffness or tension. Since this partial differential equation is second order in time, we are
required to specify both the initial displacement and the initial velocity:
u(0, x) = f(x),        \frac{\partial u}{\partial t}(0, x) = g(x).    (13.80)

In addition, we must impose suitable boundary conditions to ensure self-adjointness of the
underlying differential operator. The usual boundary conditions (Dirichlet, Neumann,
mixed or periodic) continue to play a central role, and have immediate physical interpretations. Tying down an end of the string imposes a Dirichlet condition u(t, 0) = \alpha. A free
end is prescribed by a homogeneous Neumann boundary condition u_x(t, 0) = 0. Periodic
boundary conditions, as in (13.11), correspond to the vibrations of a circular ring.
The second order partial differential equation (13.79) models the dynamics of vibrations and waves in a broad range of continuous media, including elastic vibrations of a bar,
sound vibrations in a column of air, e.g., inside a wind instrument, and also transverse
vibrations of a string, e.g., a violin string. (However, bending vibrations of a beam lead to
a fourth order partial differential equation; see Exercise .) It can also be used to model small
amplitude water waves, electromagnetic waves, including light, radio and microwaves, and
many others.
The simplest case is when the medium is homogeneous, and so both the density and the
stiffness are constant. Then the general vibration equation (13.79) reduces to the one-dimensional wave equation

\frac{\partial^2 u}{\partial t^2} = c^2\, \frac{\partial^2 u}{\partial x^2}.    (13.81)

The constant

c = \sqrt{\frac{\kappa}{\rho}} > 0    (13.82)

is called the wave speed, for reasons that will soon become apparent.
The method for solving such vibration equations is motivated by our solution in the
discrete case discussed in Section 8.8. To keep matters simple, we shall concentrate on the
homogeneous wave equation, although the underlying technique can be similarly applied
to the general version (13.79). Since we expect time periodic solutions, we try a separable
solution
u(t, x) = \cos \omega t \; v(x)    (13.83)

with trigonometric time dependence. Differentiating (13.83), we find

\frac{\partial^2 u}{\partial t^2} = - \omega^2 \cos \omega t \; v(x),        \frac{\partial^2 u}{\partial x^2} = \cos \omega t \; v''(x).

Substituting these formulae into the wave equation (13.81) and canceling the common
cosine factors, we deduce that v(x) must satisfy the differential equation

c^2\, \frac{d^2 v}{dx^2} + \omega^2 v = 0.    (13.84)

Ignoring the boundary conditions for the moment, if \omega \neq 0, the solutions are the trigonometric functions \cos \frac{\omega x}{c}, \sin \frac{\omega x}{c}, and so we have constructed the explicit solutions

\cos \omega t \, \cos \frac{\omega x}{c},        \cos \omega t \, \sin \frac{\omega x}{c},

to the wave equation. Now, in the original ansatz (13.83), the cosine could just as
well be a sine, and the same computation applies. Therefore, we produce two additional
solutions

\sin \omega t \, \cos \frac{\omega x}{c},        \sin \omega t \, \sin \frac{\omega x}{c}.

Each of these four solutions represents a spatially periodic standing wave form of period
2 \pi c / \omega, that is vibrating with frequency \omega. Note particularly that the smaller scale waves
vibrate faster.
On the other hand, if \omega = 0, then (13.84) has the solution v = \alpha x + \beta, leading to the
solutions

u(t, x) = 1,    and    u(t, x) = x.    (13.85)
The first is a constant, nonvibrating solution, while the second is also constant in time,
but will typically not satisfy the boundary conditions and so can be discarded. As we
learned in Chapter 8, the existence of a zero eigenvalue corresponds to an unstable mode
in the physical system, in which the displacement grows linearly in time. In the present
situation, these correspond to the additional solutions
u(t, x) = t,    and    u(t, x) = x\, t,    (13.86)

both of which satisfy the wave equation. Again, the second solution will typically not
satisfy the homogeneous boundary conditions, and can usually be ignored.
The boundary conditions will serve to specify the particular eigenvalues and natural
frequencies of vibration. Consider first the case of a string of length ` with two fixed ends,
and thus subject to homogeneous Dirichlet boundary conditions
u(t, 0) = 0 = u(t, `).
This constitutes a positive definite boundary value problem, and so there is no unstable
mode. The eigenfunctions of the boundary value problem (13.84) with Dirichlet boundary
conditions v(0) = 0 = v(`) were found in (13.17), and are
v_n(x) = \sin \frac{n \pi x}{\ell}    with    \omega_n = \frac{n \pi c}{\ell},        n = 1, 2, 3, \ldots.

Therefore, we can write the general solution as a Fourier sine series

u(t, x) = \sum_{n=1}^{\infty} \Bigl[ b_n \cos \frac{n \pi c\, t}{\ell} \sin \frac{n \pi x}{\ell} + d_n \sin \frac{n \pi c\, t}{\ell} \sin \frac{n \pi x}{\ell} \Bigr].    (13.87)

The solution is thus a linear combination of the natural Fourier modes vibrating with
frequencies

\omega_n = \frac{n \pi c}{\ell} = \frac{n \pi}{\ell} \sqrt{\frac{\kappa}{\rho}},        n = 1, 2, 3, \ldots.    (13.88)

Note that the longer the length \ell of the string, or the higher its density \rho, the slower
the vibrations, whereas increasing its stiffness or tension \kappa speeds them up, in exact
accordance with physical intuition.

The Fourier coefficients b_n and d_n in (13.87) will be uniquely determined by the
initial conditions (13.80). Differentiating the series term by term, we discover that we
must represent the initial displacement and velocity as Fourier sine series

u(0, x) = \sum_{n=1}^{\infty} b_n \sin \frac{n \pi x}{\ell} = f(x),        \frac{\partial u}{\partial t}(0, x) = \sum_{n=1}^{\infty} d_n\, \frac{n \pi c}{\ell} \sin \frac{n \pi x}{\ell} = g(x).

Therefore,

b_n = \frac{2}{\ell} \int_0^{\ell} f(x) \sin \frac{n \pi x}{\ell} \, dx,        n = 1, 2, 3, \ldots,

are the Fourier sine coefficients (11.78) of the initial displacement f(x), while

d_n = \frac{2}{n \pi c} \int_0^{\ell} g(x) \sin \frac{n \pi x}{\ell} \, dx,        n = 1, 2, 3, \ldots,

are rescaled versions of the Fourier sine coefficients of the initial velocity g(x).

Example 13.4. A string of unit length is held taut in the center and then released.
Let us see how to describe the ensuing vibrations. Let us assume the physical units are
chosen so that c^2 = 1, and so we are asked to solve the initial-boundary value problem

u_{tt} = u_{xx},        u(0, x) = f(x),    u_t(0, x) = 0,        u(t, 0) = u(t, 1) = 0.    (13.89)

To be specific, we assume that the center of the string has been displaced by half a unit,
and so the initial displacement is

f(x) = \begin{cases} x, & 0 \leq x \leq \frac{1}{2}, \\ 1 - x, & \frac{1}{2} \leq x \leq 1. \end{cases}
The vibrational frequencies are the integral multiples \omega_n = n \pi, and so the natural modes
of vibration are

\cos n \pi t \, \sin n \pi x    and    \sin n \pi t \, \sin n \pi x    for    n = 1, 2, \ldots.

Consequently, the general solution to the boundary value problem is

u(t, x) = \sum_{n=1}^{\infty} \bigl[ b_n \cos n \pi t \, \sin n \pi x + d_n \sin n \pi t \, \sin n \pi x \bigr],

where

b_n = 2 \int_0^1 f(x) \sin n \pi x \, dx = 4 \int_0^{1/2} x \sin n \pi x \, dx =
\begin{cases} \dfrac{4\, (-1)^k}{(2k+1)^2 \pi^2}, & n = 2k+1, \\[1ex] 0, & n = 2k, \end{cases}

are the Fourier sine coefficients of the initial displacement, while d_n = 0 are the Fourier
sine coefficients of the initial velocity. Therefore, the solution takes the form of a single
Fourier sine series

u(t, x) = \frac{4}{\pi^2} \sum_{k=0}^{\infty} (-1)^k \, \frac{\cos (2k+1) \pi t \; \sin (2k+1) \pi x}{(2k+1)^2}\,,    (13.90)

Figure 13.6. Plucked String Solution of the Wave Equation.

and is illustrated in Figure 13.6 at times t = 0, .2, .4, .6, .8, 1. At the final time t = 1, the
original displacement is reproduced exactly, but upside down. The subsequent dynamics
proceeds as before, but in mirror image form. The original displacement reappears at
time t = 2, after which the motion is periodically repeated. Interestingly, at times
t_k = .5, 1.5, 2.5, \ldots, the displacement is identically zero: u(t_k, x) \equiv 0, although the velocity
u_t(t_k, x) is nonzero. The solution appears to be piecewise affine, i.e., its graph is a collection
of straight lines. This fact, which is in stark contrast to the smoothing effect of the heat
equation, will be verified in Exercise , where we construct an exact analytical formula for
this solution.
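A truncated version of the series (13.90) is also easy to evaluate numerically, and reproduces the snapshots in Figure 13.6; the number of retained terms below is an arbitrary choice.

```python
import numpy as np

def plucked_string(t, x, n_terms=200):
    """Partial sum of the Fourier sine series (13.90) for the plucked
    string of Example 13.4 (c = 1, unit length)."""
    k = np.arange(n_terms)
    coeff = 4 * (-1.0)**k / ((2 * k + 1)**2 * np.pi**2)
    modes = np.cos((2 * k[:, None] + 1) * np.pi * t) * \
            np.sin((2 * k[:, None] + 1) * np.pi * np.asarray(x)[None, :])
    return coeff @ modes

x = np.linspace(0, 1, 201)
u_half = plucked_string(0.5, x)   # essentially zero displacement at t = 0.5
u_one  = plucked_string(1.0, x)   # the initial profile, upside down, at t = 1
```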
While the series form (13.87) of the solution is not entirely satisfying, we can still use it
to deduce important qualitative properties. First of all, since each term is periodic in t with
period 2 `/c, the entire solution is time periodic with that period: u(t + 2`/c, x) = u(t, x).
In fact, after half the period, at time t = `/c, the solution reduces to
u\Bigl( \frac{\ell}{c},\, x \Bigr) = \sum_{n=1}^{\infty} (-1)^n\, b_n \sin \frac{n \pi x}{\ell} = - \sum_{n=1}^{\infty} b_n \sin \frac{n \pi (\ell - x)}{\ell} = - u(0, \ell - x) = - f(\ell - x).

In general,

u\Bigl( t + \frac{\ell}{c},\, x \Bigr) = - u(t, \ell - x),        u\Bigl( t + \frac{2 \ell}{c},\, x \Bigr) = u(t, x).    (13.91)

Therefore, the initial wave form is reproduced, first as an upside down mirror image of
itself at time t = `/c, and then identical to its original form at time t = 2 `/c. This
has the important consequence that vibrations of (homogeneous) one-dimensional media
are purely periodic phenomena! There is no quasi-periodicity because all the vibrational
frequencies are integer multiples of the fundamental frequency.
Remark : The preceding analysis has important musical consequences. To the human
ear, sonic vibrations that are integral multiples of a single frequency are harmonic, whereas
3/7/03

558

c 2003

Peter J. Olver

those that admit quasi-periodic vibrations, with irrationally related frequencies, sound
percussive. This is why most tonal instruments rely on vibrations in one dimension, be it
a violin string, a column of air in a wind instrument (flute, clarinet, trumpet or saxophone),
a xylophone bar or a triangle. On the other hand, most percussion instruments rely on
the vibrations of two-dimensional media, e.g., drums and cymbals, or three-dimensional
media, e.g., solid blocks, which, as we shall see in Chapters 16 and 17, admit frequencies
with irrational ratios.
A bar with both ends left free, and so subject to the boundary conditions
\frac{\partial u}{\partial x}(t, 0) = 0 = \frac{\partial u}{\partial x}(t, \ell),
will have a slightly different behavior, owing to the instability of the underlying equilibrium
equations. The eigenfunctions of (13.84) with Neumann boundary conditions v'(0) = 0 =
v'(\ell) are now

v_n(x) = \cos \frac{n \pi x}{\ell}    with    \omega_n = \frac{n \pi c}{\ell},        n = 0, 1, 2, 3, \ldots.

The result is that the solution takes the form of a Fourier cosine series

u(t, x) = a_0 + c_0\, t + \sum_{n=1}^{\infty} \Bigl[ a_n \cos \frac{n \pi c\, t}{\ell} \cos \frac{n \pi x}{\ell} + c_n \sin \frac{n \pi c\, t}{\ell} \cos \frac{n \pi x}{\ell} \Bigr].    (13.92)

In accordance with (13.85), the first two terms come from the null eigenfunction v_0(x) = 1
with \omega_0 = 0. The bar vibrates with the same fundamental frequencies (13.88) as in the
fixed end case, but there is now an additional unstable mode c0 t that is no longer periodic,
but grows linearly in time.
Substituting (13.92) into the initial conditions (13.80), we find the Fourier coefficients
are prescribed, as before, by the initial displacement and velocity,

a_n = \frac{2}{\ell} \int_0^{\ell} f(x) \cos \frac{n \pi x}{\ell} \, dx,        c_n = \frac{2}{n \pi c} \int_0^{\ell} g(x) \cos \frac{n \pi x}{\ell} \, dx,        n = 1, 2, 3, \ldots.

The order zero coefficients,

a_0 = \frac{1}{\ell} \int_0^{\ell} f(x) \, dx,        c_0 = \frac{1}{\ell} \int_0^{\ell} g(x) \, dx,

are equal to the average initial displacement and average initial velocity of the string. In
particular, when c_0 = 0, so that there is no net initial velocity, the unstable mode is
not excited, and the solution is time-periodic, oscillating around the position given by the
average initial displacement. On the other hand, if c_0 \neq 0, then the unstable mode will be
excited. Since there is nothing to restrain its motion, the bar will move off with constant
average speed c_0, while simultaneously vibrating at the same fundamental frequencies.

Note that, unlike the usual Fourier series, we have not included the \frac{1}{2} factor in the constant
terms in (13.92).

Similar considerations apply to the periodic boundary value problem for the wave
equation on a circular ring. The details are left as an exercise for the reader.
Forcing and Resonance
As we learned in Section 8.9, periodically forcing a discrete mechanical structure (or an
electrical circuit) at a frequency that is different from its natural vibrational frequencies
leads, in general, to a quasi-periodic response. The solution is a sum of the unforced
vibrations superimposed with an additional vibrational mode at the forcing frequency.
However, if we try forcing at one of the natural frequencies, the system may go into a
catastrophic resonance depending upon whether or not the direction of forcing is
orthogonal to the relevant eigenvector(s).
In fact, this result is of very general validity, and applies to the wave equation and
many other partial differential equations governing periodic vibrations. To keep matters
simple, we restrict our attention to the forced wave equation for a homogeneous bar
\frac{\partial^2 u}{\partial t^2} = c^2\, \frac{\partial^2 u}{\partial x^2} + F(t, x).    (13.93)
The external forcing function F (t, x) can depend upon both time t and position x. We
will be particularly interested in periodically forcing the system by

F(t, x) = \cos \omega t \; h(x),    (13.94)

where the function h(x) is specified and satisfies the boundary conditions.
As always, cf. Theorem 7.28, the solution to an inhomogeneous linear system can be
written as a linear combination,
u(t, x) = u^\star(t, x) + z(t, x)    (13.95)

of a particular solution u^\star(t, x) along with the general solution z(t, x) to the homogeneous
equation, namely

\frac{\partial^2 z}{\partial t^2} = c^2\, \frac{\partial^2 z}{\partial x^2}.    (13.96)
The boundary conditions and initial conditions will serve to uniquely prescribe the solution
u(t, x), but there is some flexibility in its two constituents u^\star, z. For instance, we may ask
that the particular solution u^\star satisfy the homogeneous boundary conditions along with
zero (homogeneous) initial conditions, and thus represents the pure response of the system
to the forcing. The homogeneous solution z(t, x) will then reflect the effect of the initial
and boundary conditions unadulterated by the external forcing. The solution is then a
sum of the two individual responses.
In the case of periodic forcing (13.94), we try to find a particular solution
u^\star(t, x) = \cos \omega t \; v^\star(x)    (13.97)

that vibrates with the forcing frequency. Substituting the ansatz (13.97) into the equation (13.93), and canceling the common cosine factors, we discover that v^\star(x) must satisfy
the boundary value problem prescribed by

c^2\, (v^\star)'' + \omega^2\, v^\star = - h(x),    (13.98)

supplemented by the homogeneous boundary conditions (Dirichlet, Neumann, mixed, or


periodic).
At this point, there are two possibilities. If the unforced, homogeneous boundary
value problem has only the trivial solution v \equiv 0, then there is a solution to the forced
boundary value problem for any form of the forcing function h(x). On the other hand,
the homogeneous boundary value problem has a nontrivial solution v(x) if and only if
\omega^2 = \lambda is an eigenvalue, and so \omega is a natural frequency of vibration of the homogeneous
problem; the solution v(x) is the corresponding eigenfunction appearing in the solution
series (13.87). In this case, the boundary value problem (13.98) has a solution if and only
if the forcing function h(x) is orthogonal to the eigenfunction(s):

\langle h\,;\, v \rangle = 0.    (13.99)

This result is a manifestation of the Fredholm alternative, Theorem 5.45, and the self-adjointness of the boundary value problem; see Example 10.3 and Exercise for details. If
we force in a resonant manner, meaning that (13.99) is not satisfied, then the solution
will have the form of a resonantly growing vibration

u^\star(t, x) = t \sin \omega t \; v^\star(x)

that will, if unchecked, eventually lead to a catastrophic breakdown of the system, e.g.,
the bar breaks or the string snaps.
Remark : The system may be able to handle resonance if it is sent into a regime in
which the physical and mathematical assumptions underlying the simple linear wave model
are no longer valid. For example, the large amplitude vibrations arising in resonance will be
properly modeled by some form of nonlinear partial differential equation which prevents
the catastrophic failure of the system. Or damping in the form of friction may play a
moderating influence.
There are, indeed, familiar physical systems where resonance is desirable! One example is a microwave oven. The microwaves are tuned to the resonant frequencies of water
molecules, and thus excite them into large vibrations, thereby heating up your dinner. A
wind instrument is another example; blowing into a clarinet excites the resonant frequencies in the column of air contained within the instrument, and this produces the musical
sound vibrations that we hear.
Example 13.5. As a specific example, consider the forced vibrations of a uniform
bar that is fixed at both ends:
u_{tt} = c^2 u_{xx} + \cos \omega t \; h(x),        u(t, 0) = 0 = u(t, 1),        u(0, x) = f(x),    u_t(0, x) = g(x).    (13.100)

(We take the length \ell = 1 to simplify the formulas.) The particular solution will have
the nonresonant form (13.97) provided we can find a solution v^\star(x) to the boundary value
problem

c^2\, (v^\star)'' + \omega^2\, v^\star = - h(x),        v^\star(0) = 0 = v^\star(1).    (13.101)

The resonant frequencies and corresponding eigenfunctions in this particular case are
\omega_n = n \pi c,        v_n(x) = \sin n \pi x,        n = 1, 2, 3, \ldots.

The boundary value problem (13.101) will have a solution, and hence the forcing is not
resonant, provided either \omega \neq \omega_n is not an eigenvalue, or \omega = \omega_n is an eigenvalue, but

0 = \langle h\,;\, v_n \rangle = \int_0^1 h(x) \sin n \pi x \, dx    (13.102)

is orthogonal to the associated eigenfunction. The remaining (generic) case, where the
forcing profile is not orthogonal to the eigenfunction, induces a resonance whose amplitude
grows linearly in time.
For example, under periodic forcing of frequency \omega with trigonometric profile h(x) \equiv
\sin k \pi x, the particular solution to (13.101) is

v^\star(x) = \frac{\sin k \pi x}{k^2 \pi^2 c^2 - \omega^2},    so that    u^\star(t, x) = \frac{\cos \omega t \; \sin k \pi x}{k^2 \pi^2 c^2 - \omega^2},    (13.103)

which is a valid solution as long as \omega \neq \omega_k = k \pi c. Note that we may allow the forcing
frequency \omega = \omega_n to coincide with any other resonant forcing frequency, n \neq k, because
the sine profiles are mutually orthogonal and so the nonresonance condition (13.102) is
satisfied. On the other hand, if \omega = \omega_k = k \pi c, then the particular solution

u^\star(t, x) = \frac{t \, \sin k \pi c\, t \; \sin k \pi x}{2 k \pi c}    (13.104)

is resonant, and grows linearly in time, in precise analogy with the ordinary differential
equation case discussed in Section 8.9.
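The contrast between the bounded response (13.103) and the linearly growing resonant response (13.104) can be seen directly by evaluating both formulas; in the sketch below the wave speed, forcing mode, and nonresonant frequency are arbitrary illustrative choices.

```python
import numpy as np

c, k = 1.0, 2                     # wave speed and forcing mode (illustrative)
x = np.linspace(0, 1, 201)
times = np.linspace(0, 20, 400)

def u_nonresonant(t, x, omega):
    # particular solution (13.103), valid when omega != k*pi*c
    return np.cos(omega * t) * np.sin(k * np.pi * x) / (k**2 * np.pi**2 * c**2 - omega**2)

def u_resonant(t, x):
    # particular solution (13.104): amplitude grows linearly in t
    return t * np.sin(k * np.pi * c * t) * np.sin(k * np.pi * x) / (2 * k * np.pi * c)

amp_nonres = [np.max(np.abs(u_nonresonant(t, x, omega=3.0))) for t in times]  # stays bounded
amp_res    = [np.max(np.abs(u_resonant(t, x))) for t in times]                # grows like t
```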
To obtain the actual solution to the initial-boundary value problem, we write u = u^\star + z,
where z(t, x) must satisfy

z_{tt} - c^2 z_{xx} = 0,        z(t, 0) = 0 = z(t, 1),

along with the modified initial conditions

z(0, x) = f(x) - \frac{\sin k \pi x}{k^2 \pi^2 c^2 - \omega^2},        \frac{\partial z}{\partial t}(0, x) = g(x),

stemming from the fact that the particular solution (13.103) has non-trivial initial data.
(In the resonant case (13.104), there is no extra term in the initial data.) As before, the
solution z(t, x) to the homogeneous equation can be written as a Fourier sine series (13.87).
The final formulae are left to the reader to complete.
Remark: One can handle inhomogeneous boundary conditions by subtracting a suitable function that interpolates the boundary conditions, as in (13.27). This will, in general,
introduce an extra forcing function into the equation, which can then be handled by the
preceding techniques. In particular, forcing the boundary at a resonant frequency will,
typically, produce a resonant solution. This phenomenon is well known to children playing
jump rope!

13.4. d'Alembert's Solution of the Wave Equation.


In the particular case of the one-dimensional wave equation, there is an alternative
explicit solution formula due to the eighteenth century French mathematician Jean
d'Alembert, in whose work the wave equation first appeared. D'Alembert's solution avoids
the complicated Fourier series formulae, and thereby provides additional insight into the
behavior of the solutions. Unfortunately, unlike the series method, which has very broad
applicability, d'Alembert's approach only works for the homogeneous wave equation in a
single space variable.
The method begins by writing the wave equation (13.81) in the suggestive form
\square\, u = (\partial_t^2 - c^2 \partial_x^2)\, u = u_{tt} - c^2 u_{xx} = 0.    (13.105)

Here \square = \partial_t^2 - c^2 \partial_x^2 is a common mathematical notation for the linear wave differential
operator, while \partial_t, \partial_x are convenient shorthands for the partial derivative operators with
respect to t and x. A key observation is that, in analogy with the elementary polynomial
factorization

t^2 - c^2 x^2 = (t - c\, x)(t + c\, x),

we can factorize the second order wave operator into a product of two first order partial
differential operators:

\square = \partial_t^2 - c^2 \partial_x^2 = (\partial_t - c\, \partial_x)\,(\partial_t + c\, \partial_x).    (13.106)

If the second factor annihilates u, meaning

(\partial_t + c\, \partial_x)\, u = u_t + c\, u_x = 0,    (13.107)

then u is automatically a solution to the wave equation:


\square\, u = (\partial_t - c\, \partial_x)\,(\partial_t + c\, \partial_x)\, u = (\partial_t - c\, \partial_x)\, 0 = 0.

In other words, every solution to the simpler first order partial differential equation (13.107)
is a solution to the wave equation (13.81). (The converse is, of course, not true.)
It is relatively easy to solve linear first order partial differential equations. The
general solution to (13.107) has a particularly simple form.

Proposition 13.6. Every solution u(t, x) to the partial differential equation

\frac{\partial u}{\partial t} + c\, \frac{\partial u}{\partial x} = 0    (13.108)

has the form

u(t, x) = p(x - c\, t),    (13.109)

where p(\xi) is an arbitrary function of the single characteristic variable \xi = x - c\, t.

We are employing a subscript notation for derivatives, so u_t = \partial u/\partial t, u_{xx} = \partial^2 u/\partial x^2,
u_{xy} = \partial^2 u/\partial x\, \partial y, etc.
See Chapter 21 for the nonlinear case.
Peter J. Olver

Figure 13.7. Traveling Wave.

Proof: We adopt a linear change of variables to write the solution

u(t, x) = p(t,\, x - c\, t) = p(t, \xi)

in terms of the characteristic variable \xi and time t. Using the chain rule, we can express the
derivatives of u in terms of the derivatives of p as follows:

\frac{\partial u}{\partial x} = \frac{\partial p}{\partial \xi},        \frac{\partial u}{\partial t} = \frac{\partial p}{\partial t} - c\, \frac{\partial p}{\partial \xi},

and hence

\frac{\partial u}{\partial t} + c\, \frac{\partial u}{\partial x} = \frac{\partial p}{\partial t} - c\, \frac{\partial p}{\partial \xi} + c\, \frac{\partial p}{\partial \xi} = \frac{\partial p}{\partial t}.

Therefore, u is a solution to (13.108) if and only if p(t, \xi) is a solution to the very simple
partial differential equation

\frac{\partial p}{\partial t} = 0.

This clearly implies that p = p(\xi) does not depend on the variable t, and hence

u = p(\xi) = p(x - c\, t)

is of the desired form.    Q.E.D.

Therefore, any function of the characteristic variable, e.g., \xi^2 + 1 or \cos \xi or e^{\xi}, will
produce a corresponding solution, (x - c\, t)^2 + 1 or \cos(x - c\, t) or e^{x - c\, t}, to the first order
partial differential equation (13.108), and hence a solution to the wave equation (13.81).
The functions of the form (13.109) are known as traveling waves. At t = 0 the wave has
the initial profile u(0, x) = p(x). As t progresses, the wave moves to the right with speed
c > 0, unchanged in form; see Figure 13.7. For this reason, (13.108) is sometimes referred
to as the one-way or unidirectional wave equation. We conclude that every traveling wave
solution to the unidirectional wave equation (13.108) is a solution to the wave equation
(13.81).
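This chain of implications is easy to check symbolically for an arbitrary smooth profile p; the following sketch (using sympy) verifies that p(x - ct) annihilates the first order operator (13.107) and hence satisfies the wave equation (13.81).

```python
import sympy as sp

t, x, c = sp.symbols('t x c', real=True)
p = sp.Function('p')              # arbitrary smooth wave profile

u = p(x - c * t)                  # right-moving traveling wave (13.109)

# u is annihilated by the first order operator (13.107) ...
print(sp.simplify(sp.diff(u, t) + c * sp.diff(u, x)))            # prints 0
# ... and therefore also satisfies the full wave equation (13.81)
print(sp.simplify(sp.diff(u, t, 2) - c**2 * sp.diff(u, x, 2)))   # prints 0
```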
Now, since c is constant, the factorization (13.106) can be written equally well in the
reverse order:
\square = \partial_t^2 - c^2 \partial_x^2 = (\partial_t + c\, \partial_x)\,(\partial_t - c\, \partial_x).    (13.110)

More correctly, one must also assume that, at each time t, the domain of definition of p(\xi)
is connected. A similar restriction should be imposed upon the solutions in the statement of the
Proposition.


The same argument tells us that any solution to the alternative first order partial differential equation

(\partial_t - c\, \partial_x)\, u = \frac{\partial u}{\partial t} - c\, \frac{\partial u}{\partial x} = 0,    (13.111)

also provides a solution to the wave equation. This is a one-way wave equation with the
opposite speed -c. Applying Proposition 13.6 with c replaced by -c, we conclude that
the general solution to (13.111) has the form

u(t, x) = q(x + c\, t),    (13.112)

where q(\eta) is an arbitrary differentiable function of the alternate characteristic variable
\eta = x + c\, t. The solutions (13.112) represent traveling waves moving to the left with speed
c > 0 and unchanged in form.
Thus, we have uncovered two different classes of solutions to the full wave equation
(13.81). One class consists of the traveling wave solutions moving to the right with constant
speed c, while the other class consists of the traveling wave solutions moving to the left
with the same constant speed c. Thus, the wave equation is bidirectional and has both left
and right traveling wave solutions. Of course, these solutions do not necessarily respect
the boundary conditions, which, when present, will affect the ultimate behavior.
Linearity of the wave equation implies that the sum of solutions is again a solution.
In this way, we can produce solutions which are superpositions of left and right traveling
waves. The remarkable fact, due to dAlembert, is that every solution to the wave equation
can be so represented.
Theorem 13.7. The general solution to the wave equation (13.81) is given by a
combination
u(t, x) = p(\xi) + q(\eta) = p(x - c\, t) + q(x + c\, t)    (13.113)

of right and left traveling waves, depending on their respective characteristic variables

\xi = x - c\, t,        \eta = x + c\, t.    (13.114)

Proof: The key is to use a linear change of variables to rewrite the wave equation
entirely in terms of the characteristic variables \xi, \eta defined by (13.114). We set

u(t, x) = w(x - c\, t,\, x + c\, t) = w(\xi, \eta),    or    w(\xi, \eta) = u\Bigl( \frac{\eta - \xi}{2 c},\, \frac{\xi + \eta}{2} \Bigr).
Then, according to the chain rule,
\frac{\partial u}{\partial x} = \frac{\partial w}{\partial \xi} + \frac{\partial w}{\partial \eta},        \frac{\partial u}{\partial t} = c \left( \frac{\partial w}{\partial \eta} - \frac{\partial w}{\partial \xi} \right),

and hence

\frac{\partial^2 u}{\partial x^2} = \frac{\partial^2 w}{\partial \xi^2} + 2\, \frac{\partial^2 w}{\partial \xi\, \partial \eta} + \frac{\partial^2 w}{\partial \eta^2},        \frac{\partial^2 u}{\partial t^2} = c^2 \left( \frac{\partial^2 w}{\partial \xi^2} - 2\, \frac{\partial^2 w}{\partial \xi\, \partial \eta} + \frac{\partial^2 w}{\partial \eta^2} \right).
Therefore,

\square\, u = \frac{\partial^2 u}{\partial t^2} - c^2\, \frac{\partial^2 u}{\partial x^2} = -\, 4\, c^2\, \frac{\partial^2 w}{\partial \xi\, \partial \eta}.

We conclude that u(t, x) solves the wave equation \square\, u = 0 if and only if w(\xi, \eta) solves the
second order partial differential equation

\frac{\partial^2 w}{\partial \xi\, \partial \eta} = 0,    which we write in the form    \frac{\partial}{\partial \xi} \left( \frac{\partial w}{\partial \eta} \right) = 0.

As in the proof of Proposition 13.6, this partial differential equation can be integrated once
with respect to \xi, resulting in

\frac{\partial w}{\partial \eta} = r(\eta),

where r(\eta) is an arbitrary function. Integrating both sides of the latter equation with
respect to \eta, we find

w(\xi, \eta) = p(\xi) + q(\eta),    where    q'(\eta) = r(\eta),

and p(\xi) represents the constant of integration with respect to \eta. Replacing the characteristic variables by their formulae in terms of x and t completes the proof.    Q.E.D.
Remark : As above, we have been a little cavalier with our specification of the domain
of definition of the functions and the differentiability assumptions required. Sorting out
the precise technical details is not difficult.
Remark : The general solution to a second order ordinary differential equation depends
on two arbitrary constants. The general solution to a second order partial differential
equation typically depends on two arbitrary functions, in this case p(\xi) and q(\eta).
Let us now see how this new form of the wave equation solution can be used to
effectively solve initial value problems. The simplest case is that of a bar or string of
infinite length, in which case we have a pure initial value problem
\frac{\partial^2 u}{\partial t^2} = c^2\, \frac{\partial^2 u}{\partial x^2},        u(0, x) = f(x),    \frac{\partial u}{\partial t}(0, x) = g(x),        -\infty < x < \infty.    (13.115)

The only boundary conditions are that the solution remain bounded: | u(t, x) | \leq M.
Substituting the solution formula (13.113) into the initial conditions, we find
u(0, x) = p(x) + q(x) = f(x),        \frac{\partial u}{\partial t}(0, x) = - c\, p'(x) + c\, q'(x) = g(x).

To solve this pair of linear equations for p and q, we differentiate the first equation:

p'(x) + q'(x) = f'(x).


Subtracting the second equation divided by c, we find
2 p0 (x) = f 0 (x)
3/7/03

566

1
g(x).
c
c 2003

Peter J. Olver

Therefore,
1
1
p(x) = f (x)
2
2c

g(z) dz + a,
0

where a is an integration constant. The first equation then yields


Z x
1
1
q(x) = f (x) p(x) = f (x) +
g(z) dz a.
2
2c 0
Substituting these two expressions back into (13.113), we find
" Z
Z #

f () + f ()
1
u(t, x)= p() + q() =
+

+
g(z) dz
2
2c
0
0
Z
1
f () + f ()
+
g(z) dz,
=
2
2c
where , are the characteristic variables (13.114). In this fashion, we have derived
dAlemberts solution to the wave equation on the entire line < x < .
Theorem 13.8. The solution to the initial value problem
2
2u
2 u
=
c
,
t2
x2
is given by

u
(0, x) = g(x),
t

u(0, x) = f (x),

1
f (x c t) + f (x + c t)
+
u(t, x) =
2
2c

< x < .

(13.116)

g(z) dz.

(13.117)

x+c t
xc t

Let us investigate the implications of dAlemberts formula (13.117). First, suppose


there is no initial velocity, so g(x) 0, and the motion is purely the result of the initial
displacement u(0, x) = f (x). In this case, the solution (13.117) reduces to
u(t, x) =

1
2

1
2

f (x c t) +

f (x + c t).

The basic effect is that the initial displacement f (x) splits into two waves, one traveling
to the right and one traveling to the left, each with exactly the same shape as the initial
displacement f (x), but only half as tall. For example, if the initial displacement is a
localized pulse, centered at the origin, say,
u
(0, x) = 0,
t

u(0, x) = e x ,
then the solution
u(t, x) =

1
2

e (xc t) +

1
2

e (x+c t)

consists of two half size copies of the initial pulse running away from the origin in opposite
directions with equal speed c. If we take two separated pulses, say
2

u(0, x) = e x + 2 e (x1) ,
3/7/03

567

u
(0, x) = 0,
x
c 2003

Peter J. Olver

0.5

0.5

0.5

0.5

1.5

0.5

1.5

-0.5

-0.5

-0.5

-1

-1

-1

0.5

0.5

0.5

0.5

1.5

0.5

1.5

-0.5

-0.5

-0.5

-1

-1

-1

0.5

1.5

0.5

1.5

Interaction of Waves.

Figure 13.8.

centered at x = 0 and x = 1, then the solution

u(t, x) = \tfrac{1}{2} \Bigl[ e^{-(x - c\, t)^2} + 2\, e^{-(x - 1 - c\, t)^2} \Bigr] + \tfrac{1}{2} \Bigl[ e^{-(x + c\, t)^2} + 2\, e^{-(x - 1 + c\, t)^2} \Bigr]

will consist of four pulses, two moving to the right and two to the left, all with the same
speed, as pictured in Figure 13.8.
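D'Alembert's formula (13.117) is also simple to implement numerically; the sketch below approximates the velocity integral by the trapezoidal rule and, for concreteness, reuses the two-pulse initial displacement of the preceding example with zero initial velocity. The quadrature resolution and plotting grid are arbitrary choices.

```python
import numpy as np

def dalembert(f, g, x, t, c=1.0, n_quad=400):
    """Evaluate d'Alembert's formula (13.117) at positions x and time t,
    approximating the integral of the initial velocity g by the trapezoidal rule."""
    wave = 0.5 * (f(x - c * t) + f(x + c * t))
    z = np.linspace(x - c * t, x + c * t, n_quad)   # quadrature nodes per point
    velocity_term = np.trapz(g(z), z, axis=0) / (2 * c)
    return wave + velocity_term

# the two-pulse initial displacement discussed above, with zero initial velocity
f = lambda x: np.exp(-x**2) + 2 * np.exp(-(x - 1)**2)
g = lambda x: np.zeros_like(x)

x = np.linspace(-4, 5, 901)
u = dalembert(f, g, x, t=2.0)    # four pulses moving apart with speed c = 1
```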
Remark : If the initial displacement has compact support, and so f (x) = 0 if x < a
or x > b for some a < b, then after a short time the right and left-moving waves will
completely disengage and the observer will see two half size displacements running away,
with speed c, in opposite directions. If the displacement is not localized, then the two waves
will never fully disengage, and one might be hard pressed (just as in our earlier discussion
of quasi-periodic phenomena) to recognize that a complicated solution pattern is, in
reality, just the superposition of two very simple traveling wave solutions.
An important observation is that when a right-moving pulse collides with a left-moving
pulse, they emerge from the collision unchanged, a consequence of the linearity of the
wave equation. The first picture shows the initial displacement. In the second and third
pictures, the two localized bumps have each split into two copies moving in opposite
directions. In the fourth and fifth, the larger right moving bump is in the process of
interacting with the smaller left moving bump. Finally, in the last picture the interaction
is complete, and the two left moving bumps and two right moving bumps travel in tandem
with no further collisions.
The lines in the (t, x)-plane where the characteristic variables are constant,

\xi = x - c\, t = a,        \eta = x + c\, t = b,    (13.118)

have slope \pm c, and are known as the characteristics of the wave equation. The two
characteristics emanating from a point on the x axis, where the initial data is prescribed,
are illustrated in Figure 13.9. The reader should note that, in this figure, the t axis is
horizontal, while the x axis is vertical.

Figure 13.9. Characteristic Lines for the Wave Equation.

In general, signals propagate along characteristics. More specifically, if we start out


with an initial displacement concentrated very close to a point x = a, then the solution will
be concentrated along the two characteristic lines through the point x = a, t = 0, namely
x - c\, t = a and x + c\, t = a. In the limit, a unit impulse or delta function displacement at
x = a, corresponding to the initial condition

u(0, x) = \delta(x - a),        \frac{\partial u}{\partial t}(0, x) = 0,    (13.119)

will result in a solution

u(t, x) = \tfrac{1}{2}\, \delta(x - c\, t - a) + \tfrac{1}{2}\, \delta(x + c\, t - a),    (13.120)
consisting of two half-strength delta spikes traveling away from the starting position along
the two characteristic lines.
Let us now return to the general initial value problem (13.116). Suppose that there is
no initial displacement, u(0, x) = f(x) \equiv 0, but rather a concentrated initial velocity, say
a delta function

\frac{\partial u}{\partial t}(0, x) = \delta_a(x) = \delta(x - a).

Physically, this would correspond to striking the string at the point x = a. The d'Alembert
solution (13.117) is then

u(t, x) = \frac{1}{2 c} \int_{x - c t}^{x + c t} \delta_a(z) \, dz = \begin{cases} \dfrac{1}{2 c}, & x - c\, t < a < x + c\, t, \\[1ex] 0, & \text{otherwise.} \end{cases}    (13.121)

Thus, the solution consists of a constant displacement, of magnitude 1/(2 c), between the
two characteristic lines x - c\, t = a and x + c\, t = a emanating from the point x = a, t = 0,
which form the shaded region of Figure 13.9. The solution has two jump discontinuities between the
undisturbed state and the displaced state, each propagating along its characteristic line
with speed c, but in opposite directions. The solution is plotted in Figure 13.10. Note
that, unlike a concentrated initial displacement, where the signal remains concentrated and
each point along the bar is displaced for a while, but eventually returns to its undisturbed
state, a concentrated initial velocity has a lasting effect, and the bar remains permanently
displaced by an amount 1/(2 c).

Figure 13.10. Concentrated Initial Velocity for Wave Equation.

Solutions on Bounded Intervals


So far, we have been looking at the solutions to the initial value problem for the
wave equation on an infinite interval. If we restrict to a bounded interval and impose
boundary conditions, then the d'Alembert formula can still be applied, but in a suitably
modified format. The easiest is the periodic case. If the boundary conditions are periodic
on 0 \leq x \leq \ell, then one extends the initial displacement and velocity, f(x) and g(x),
to also be periodic of period \ell. If the initial velocity has mean zero, then the resulting
d'Alembert solution (13.117) will remain periodic. Otherwise, the solution will not be
periodic in time, owing to the excitation of the unstable mode. See Exercise for details.
If we have fixed (Dirichlet) boundary conditions, say
u(t, 0) = 0,        u(t, \ell) = 0,    (13.122)

then, motivated by the fact that the solution can be written as a Fourier sine series (13.87),
one takes the initial displacement f(x) and velocity g(x) and extends them to be odd,
periodic functions of period 2 \ell:

f(-x) = -f(x),    f(x + 2 \ell) = f(x),        g(-x) = -g(x),    g(x + 2 \ell) = g(x).

This will ensure that the dAlembert solution also remains odd, periodic, and hence the
boundary conditions (13.122) remain valid for all t. Keep in mind that, while the solution
u(t, x) is defined for all x, the only physically relevant values occur on the interval 0 x `.
Nevertheless, the effects of displacements in the nonphysical regime will eventually be felt
as the propagating waves pass through the physical interval.
For example, consider an initial displacement which is concentrated near x = a for
some 0 < a < `. Its odd, periodic extension consists of two sets of replicas: those of the
same form occurring at positions a 2 `, a 4 `, . . . , and mirror image versions, resulting
from the oddness of the function, at intermediate positions a, a 2 `, a 4 `, . . . ; see
Figure 13.12. The resulting solution begins by each of the pulses, positive and negative,
splitting into two half-size replicas that propagate with speed c in opposite directions.
As the individual pulses meet, they interact as they pass through each other, eventually
emerging unchanged. The process repeats periodically, with infinite rows of pulses moving
to the right continually interacting with infinite rows moving to the left.
However, only the part of this solution that lies on 0 x ` is actually realized on
the physical bar. The net effect is as if we were viewing the solution passing by a stationary
window, of length `, that blocks out all other regions of the real axis. What the viewer
effectively sees assumes a somewhat different interpretation. Namely, the original pulse at
3/7/03

570

c 2003

Peter J. Olver

0.5

0.5

0.5

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

-0.5

-0.5

-0.5

-1

-1

-1

0.5

0.5

0.5

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

-0.5

-0.5

-0.5

-1

-1

-1

Figure 13.11.

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

Solution to Wave Equation with Fixed Ends.


1
0.5

-4

-2

Figure 13.12.

-0.5
-1

Odd Periodic Extension of a Concentrated Pulse.

position 0 < a < ` splits up into two half size replicas that move off in opposite directions.
As each half-size pulse reaches an end of the bar, it meets a mirror image pulse that has
been propagating in the opposit direction from the non-physical regime. The effect is that
the pulse appears to be reflected at the end of the interval, and changes into an upside
down mirror image of itself moving in the opposite direction. The original positive pulse
has moved off the end of the bar just as its mirror image has moved into the physical
regime. A good physical illustration is a wave propagating down a jump rope that is held
fixed at its end; the reflected wave is upside down. A similar reflection occurs as the other
half-size pulse hits the other end of the physical interval, after which the solution consists
of two upside down half-size pulses moving back towards each other. At time t = `/c
they recombine at the point ` a to instantaneously form a full-sized, but upside-down
mirror image of the original disturbance, in accordance with (13.91). This in turn splits
apart into two upside down bumps that, when they collide with the ends, reflect and
become right side up. At time t = 2 `/c they recombine to exactly reproduce the original
displacement. The process then repeats, and the solution is periodic in time with period
2 `/c. In Figure 13.11, the first picture gives the initial displacement, which splits into left
and right moving, half-size clones. In the third picture, the left moving bump is in the
process of emerging from its collision with the end. In the fourth picture, it has emerged
from its collision with the end, and is now upside down, reflected, and moving to the right.
Meanwhile, the right moving pulse is starting to collide with the right hand end. In the
fifth picture, both pulses have completed their collisions and are now moving back towards
each other, where, in the last picture, they recombine into an upside-down version of the
original pulse. The process then repeats itself in mirror image, finally recombining to the
3/7/03

571

c 2003

Peter J. Olver

original pulse after the same length of time.


The Neumann (free) boundary value problem
u
u
(13.123)
(t, 0) = 0,
(t, `) = 0,
x
x
is handled similarly. Here, inspired by the Fourier cosine series form of the solution, one
extends the initial conditions to be even, 2 ` periodic functions
f ( x) = f (x),

f (x + 2 `) = f (x),

g( x) = g(x),

g(x + 2 `) = g(x).

If the initial velocity has mean zero,


1
c0 =
`

g(x) dx = 0,

(13.124)

then the solution remains periodic of period 2 `/c. In this case, when a bump hits one of
the ends, the reflected bumps remains upright, but a mirror image of the original traveling
in the opposite direction. A familiar physical illustration is a water wave that reflects off
a solid wall. After an elapsed time of t = `/c, the individual reflected bumps recombine to
form a positive mirror image of the original displacement, i.e., u(t + `/c, x) = u(t, ` x).
After a further time lapse of t = 2 `/c, the original displacement reappears, and the solution
is time periodic with period 2 `/c, i.e., u(t + 2 `/c, x) = u(t, x). On the other hand, if there
is a net initial velocity, so c0 6= 0, then, as noted above, the solution is no longer periodic,
but is a linear combination of periodic vibrations with the steadily increasing unstable
mode c0 t.
In summary, we have now learned two different versions of the solution to the onedimensional wave equation. The first, based on Fourier analysis, emphasizes the vibrational
or wave character of the solutions. The second, based on the dAlembert formula, emphasizes the particle aspects of the solutions, where individual wave packets collide with each
other, or reflect at the boundary, but maintain their overall form. Some solutions look like
vibrating waves, while others are much more like interacting paticles. The Fourier series
shows how every particle-like solution can be decomposed into its constituent vibrational
modes, while the dAlembert formula shows how vibrating waves can be viewed as moving
particles.
The coexistence of particle and wave features is reminiscent of the long running historical debate over the nature of light, with Newton and his disciples advocating its particle
basis in the form of photons, while until the beginning of the twentieth century most
physicists advocated the wave and vibrational viewpoint. Einsteins explanation of the
photoelectric effect served to resurrect the particle interpretation of light. Only with the
establishment of quantum mechanics was the debate resolved light, and, indeed, all
subatomic particles are both, manifesting both particle and wave features depending upon
the experiment and the physical situation. But the evidence for a wave-particle duality
already existed in the classical wave equation!

13.5. Numerical Methods.


As we know, most differential equations are too complicated to solved analytically,
and so one is usually forced to resort to numerical solution methods. Even in cases, like the
3/7/03

572

c 2003

Peter J. Olver

heat and wave equations, where explicit solution formulas (either closed form or infinite
series) exist, the numerical methods still can be profitably applied to solve particular initialboundary value problems. Moreover, verification that the numerical algorithm produces a
reasonable approximation to the true solution is much easier if one has an alternative solution formula in hand. The lessons learned in the design of numerical algotihms for solved
problems prove to be of immense value when one is confronted with more complicated
problems for which solution formulas no longer exist.
In this final section we present some of the most basic numerical solution techniques
for the heat and wave equations. We just consider the simplest cases, leaving variations
and extensions to a more thorough treatment as found in basic numerical analysis texts,
[23].
Numerical solution methods for differential equations can be partitioned into two principal classes. (In this oversimplified presentation, we are leaving out more specialized methods of less general applicability.) The first category, already introduced in Section 10.6,
are the finite element methods. Finite elements are designed for the differential equations
describing equilibrium configurations, since they rely on minimizing a functional. The alternative approach is to directly approximate the derivatives appearing in the differential
equation, through use of numerical differentiation formulae. In general, to approximate
the derivative of a function, one constructs a suitable combination of sampled function
values at nearby points. The underlying formalism used to construct these approximation
formulae is know as the calculus of finite differences, and has a long history, dating back
to Newton, that includes many prominent mathematicians in its development and application. The resulting finite difference methods have extremely broad applicability, and can,
with proper care, be designed to solve most differential equations arising in mathematics,
physics, engineering, biology, finance, and elsewhere.
Finite Differences
In this section, we give a brief introduction to the most basic finite difference approximations for derivatives of a function of one variable. In this presentation, we concentrate
on the simplest version of the calculus of finite differences, based on equally spaced sample points. In the exercises, the reader is asked to generalize the difference formulae to
non-equally spaced points.
The simplest finite difference approximation is the ordinary difference quotient
u(x + h) u(x)
u0 (x),
(13.125)
h
used to approximate the first derivative of the function u(x). Throughout our discussion,
the step size h, which may be either positive or negative, is assumed to be small: | h | 1.
The difference quotient can be interpreted as a linear combination of the sampled function
values at the two nearby points x and x + h. Geometrically, the difference quotient equals
the slope of the secant line through the two points (x, u(x)) and (x + h, u(x + h)) on the
graph of the function. For small h, this should be a reasonably good approximation to the
slope of the tangent line, as illustrated in the first picture in Figure 13.13. Indeed, if u is
differentiable at x, then u0 (x) is, by definition, the limit, as h 0 of the finite difference
quotients.
3/7/03

573

c 2003

Peter J. Olver

One-Sided Difference
Figure 13.13.

Central Difference
Finite Difference Approximations.

How close an approximation is (13.125)? To answer this question, we use the first
order Taylor expansion
u(x + h) = u(x) + u0 (x) h +

1
2

u00 () h2 ,

(13.126)

where we assume that u(x) is at least twice continuously differentiable. Here represents a
point lying between x and x+h, which follows from the Cauchy form of the remainder term
(C.2) in the Taylor expansion; see Appendix C for full details. Therefore, the difference
quotient is given by the formula
u(x + h) u(x)
= u0 (x) +
h

1
2

u00 () h.

The error is the difference between the finite difference formula and the derivative being
approximated, namely
u(x + h) u(x)
u0 (x) =
h

1
2

u00 () h.

(13.127)

We say that the finite difference approximation (13.127) is first order because the error
is proportional to h. Indeed, the error can be bounded by 12 M h, where | u00 | < M is an
overall bound on the second derivative of the function near the point x. If the precise
formula for the error is not so important, we will write
u0 (x) =

u(x + h) u(x)
+ O(h).
h

(13.128)

The big Oh notation O(h) refers to a term proportional to h, or, more correctly, a term
that is bounded by a constant multiple of h as h 0.
Example 13.9. Let u(x) = sin x. Let us compute u0 (1) = cos 1 = 0.5403023 . . . by
using the finite difference quotient (13.125), and so
cos 1

sin(1 + h) sin 1
.
h

The result for different values of h is listed in the following table.


3/7/03

574

c 2003

Peter J. Olver

approximation
error

.1

.01

.001

.0001

0.067826

0.497364

0.536086

0.539881

0.540260

0.472476

0.042939

0.004216

0.000421

0.000042

1
reduces the size of the error by
We observe that reducing the step size by a factor of 10
approximately the same factor. Thus, to obtain 10 decimal digits of accuracy, we anticipate
needing a step size of about h = 1011 . The fact that the error is more of less proportioal
to the step size tells us that we are using a first order numerical approximation.

To approximate higher order derivatives, we need to evaluate the function at more


than two points. In general, an approximation to the nth order derivative u(n) (x) requires
at least n+1 distinct sample points. For example, let us try to approximate u 00 (x) by using
the particular sample points x, x + h and x h. Which combination of the function values
u(x h), u(x), u(x + h) can be used to approximate the derivative u00 (x)? The answer to
such a question can be found by consideration of the relevant Taylor expansions
h2
h3
+ u000 (x)
+ O(h4 ),
2
6
3
2
h
h
u000 (x)
+ O(h4 ),
u(x h) = u(x) u0 (x) h + u00 (x)
2
6

u(x + h) = u(x) + u0 (x) h + u00 (x)

(13.129)

where the error terms are proportional to h4 . Adding the two formulae together gives
u(x + h) + u(x h) = 2 u(x) + u00 (x) h2 + O(h4 ).
Rearranging terms, we conclude that
u00 (x) =

u(x + h) 2 u(x) + u(x h)


+ O(h2 ),
h2

(13.130)

The result is is the simplest finite difference approximation to the second derivative of a
function. The error is of order h2 , and depends upon the magnitude of the fourth order
derivative of u near x; see Exercise .
2

Example 13.10. Let u(x) = ex , with u00 (x) = (4 x2 + 2) ex . Let us approximate


00
u (1) = 6 e = 16.30969097 . . . by using the finite difference quotient (13.130):
2

e(1+h) 2 e + e(1h)
6e
.
h2
The results are listed in the following table.
h

.1

.01

.001

.0001

approximation

50.16158638

16.48289823

16.31141265

16.30970819

16.30969115

error

33.85189541

0.17320726

0.00172168

0.00001722

0.00000018

3/7/03

575

c 2003

Peter J. Olver

1
redues the size of the error by a factor of
Each reduction in step size by a factor of 10
1
100 and a gain of two new decimal digits of accuracy, which is a refelction of the fact
that the finite difference formula (13.130) is of second order, with error proportional to h 2 .
However, this prediction is not entirely borne out in practice. If we take h = .00001 then
the formula produces the approximation 16.3097002570, with an error of 0.0000092863
which is less accurate that the approximation with h = .0001. The problem is that roundoff errors have now begun to affect the computation, and underscores a significant difficulty
with numerical differentiation formulae. Such finite difference formulae involve dividing
very small quantities, and this can lead to high numerical errors due to round-off. As
a result, while they typically produce reasonably good approximations to the derivatives
for moderately small step sizes, to achieve high accuracy, one must employ high precision
arithmetic. A similar comment applied to the previous Example 13.9, and our expectations
about the error for a very small step size were not, in fact justified as the reader may have
discovered.

We can improve the order of accuracy of finite difference approximations to derivatives


by employing more sample points to form an appropriate linear combination of the function
values. For instance, if the first order approximation (13.128) to the first derivative based
on the two points x and x + h is not sufficiently accurate, one can try combining the
function values at three points x, x + h and x h. To find the appropriate combination of
u(x h), u(x), u(x + h), we return to the Taylor expansions (13.129). To solve for u 0 (x),
we subtract the two formulae, and so
h3
+ O(h4 ).
3
Rearranging the terms, we are led to the well-known centered difference formula
u(x + h) u(x h) = 2 u0 (x) h + u000 (x)

u(x + h) u(x h)
+ O(h2 ),
(13.131)
2h
which is a second order approximation to the first derivative. Geometrically, the centered difference quotient represents the slope of the secant line through the two points
(x h, u(x h)) and (x + h, u(x + h)) on the graph of u centered symmetrically about
the point x. Figure 13.13 illustrates the geometry behind the two approximations; the
advantages in accuracy in the centered difference version are graphically evident. Higher
order approximations can be found by evaluating u at additional points, including, say,
x + 2 h, x 2 h, and so on.
u0 (x) =

Example 13.11. Return to the function u(x) = sin x considered in Example 13.9.
The centered difference approxiomation to its derivative u0 (1) = cos 1 = 0.5403023 . . . is
cos 1

sin(1 + h) sin(1 h)
.
2h

The results are tabulated as follows:

The terms O(h4 ) do not cancel, since they represent potentially different multiples of h4 .

3/7/03

576

c 2003

Peter J. Olver

h
approximation
error

.1

.01

.001

.0001

0.53940225217

0.54029330087

0.54030221582

0.54030230497

0.00090005370

0.00000900499

0.00000009005

0.00000000090

As advertized, the results are much more accurate than the one-sided finite difference
approximation used in Example 13.9 at the same step size. As in Example 13.10, we see
1
adds two more decimal places of
that each reduction in the step size by a factor of 10
accuracy, which is a consequence of the second order accuracy in the centered difference
approximation.
Many more finite difference formulae can be constructed by similar manipulations of
Taylor expansions, but these will suffice for our purposes. Let us now apply these basic
formulas to construct numerical solution algorithms for the heat and wave equations.
Numerical Solution Methods for the Heat Equation
Consider the heat equation
u
2u
,
=
t
x2

0 < x < `,

(13.132)

t 0,

on a bar of length `, where > 0 represents the thermal diffusivity, which is assumed to
be constant. To be specific, we impose Dirichlet boundary conditions
u(t, 0) = (t),

u(t, `) = (t),

t 0.

(13.133)

at the ends of the bar, along with the initial conditions


u(0, x) = f (x),

0 x `.

(13.134)

In order to effect a numerical approximation to the solution to this initial-boundary value


problem, we begin by introducing a rectangular mesh consisting of points (t i , xj ) with
0 = x0 < x1 < < xn = ` and 0 = t0 < t1 < t2 < . For simplicity, we maintain a
fixed, regular mesh spacing, with
h = xj+1 xj =

`
,
n

k = ti+1 ti ,

representing, respectively, the spatial mesh size and the time step size. It is important
that the two step sizes are not necessarily the same. Note that
ti = i k,

xj = j h.

We shall use the notation


ui,j u(ti , xj )

(13.135)

to denote our numerical approximation to the value of the solution at a given mesh point.
3/7/03

577

c 2003

Peter J. Olver

As a first try at designing a numerical method, we shall use the simplest finite difference approximations to the derivatives. The second order space derivative is approximated
by (13.130), and hence
u(ti , xj+1 ) 2 u(ti , xj ) + u(ti , xj1 )
2u
(t
,
x
)

+ O(h2 )
i
j
x2
h2
ui,j+1 2 ui,j + ui,j1
+ O(h2 ),

h2

(13.136)

where the error in the approximation is proportional to h2 . Similarly, the one-sided finite
difference approximation (13.128) is used for the time derivative, and so
u(ti+1 , xj ) u(ti , xj )
ui+1,j ui,j
u
(ti , xj )
+ O(k)
+ O(k),
t
k
k

(13.137)

where the error is proportion to k. In practice, it is important to ensure that the approximations have similar orders of accuracy, which tells us to choose
k h2 .
Assuming the step size h < 1, this requirement has the important consequence that the
time steps must be much smaller than the space mesh size.
Remark : At this stage, the reader might be tempted to replace (13.137) by the second order central difference approximation (13.131). However, this produces significant
complications in the implementation of the method, and is not suitable for a practical
numerical algorithm for the heat equation. We shall subsequently see how to construct a
practical numerical method that is second order in the time step k.
Substituting equations (13.136), (13.137) into the partial differential equation (13.138),
and rearranging terms, we find

where

ui+1,j = ui,j + ui,j+1 2 ui,j + ui,j1 ,


=

i = 0, 1, 2, . . . ,
j = 1, . . . , n 1,

k
.
h2

(13.138)

(13.139)

The numerical scheme takes the form of an iterative linear system for the solution values
ui,j u(ti , xj ) at each time step ti .
The initial condition (13.134) means that we should initialize our numerical data by
sampling the initial temperature at the mesh points:
u0,j = fj = f (xj ),

j = 1, . . . , n 1.

(13.140)

Similarly, the boundary conditions (13.133) require that


ui,0 = i = (ti ),
3/7/03

ui,n = i = (ti ),
578

i = 0, 1, 2, . . . .
c 2003

(13.141)
Peter J. Olver

In addition, we assume consistency of the initial and boundary conditions at the corners
of the domain:
f0 = f (0) = u(0, 0) = (0) = 0 ,

fn = f (`) = u(0, `) = (0) = 0 .

The three equations (13.138), (13.140), (13.141) completely prescribe the numerical approximation scheme for solving the initial-boundary value problem (13.132), (13.133), (13.134)
for the heat equation.
Let us rewrite this discrete dynamical system in a more transparent matrix form.
First, let

T
T
u(i) = ui,1 , ui,2 , . . . , ui,n1
u(ti , x1 ), u(ti , x2 ), . . . , u(ti , xn1 )

(13.142)

be the vector whose entries are the numerical approximations to the solution values at the
interior nodes omitting the boundary nodes x0 = 0, xn = `, where the values of u are
fixed by the boundary conditions (13.133). Then (13.138) takes the matrix form
u(i+1) = A u(i) + b(i) ,

(13.143)

where

1 2

A=

1 2

1 2

..

..

..

..

1 2

b(i)

i
0

=
... .

(13.144)

The coefficient matrix A is symmetric and tridiagonal. The contributions (13.141) of the
boundary nodes are found in the vector b(i) . This numerical method is known as an
explicit scheme since each iterate is computed explicitly without relying on solving an
auxiliary equation unlike the implicit schemes discussed below. The method is not
guaranteed to work, and indeed does not unless the mesh sizes are chosen appropriately.
Example 13.12. Let us fix the diffusivity = 1 and the bar length ` = 1. For
illustrative purposes, we fix the spatial step size to be h = .1. In Figure 13.14 we compare
two (slightly different time step sizes on the same initial data as used in (13.22). The first
sequence takes k = h2 = .01 and plots the solution at times t = 0., .02, .04. The solution
is already starting to show signs of instability, and indeed soon becomes completely wild.
The second sequence takes k = .005 and plots the solution at times t = 0., .025, .05. (Note
that we are using different vertical scales for the two sequences of plots.) Even though we
are employing a rather coarse mesh, the numerical solution is not too far away from the
true solution to the initial value problem, which can be found in Figure 13.1.
In order to understand the underlying issues, let us concentrate on homogeneous
boundary conditions
u(t, 0) = 0 = u(t, `)
3/7/03

579

c 2003

Peter J. Olver

0.5

0.5

0.5

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

-0.5

-0.5

-0.5

-1

-1

-1

0.2

0.2

0.2

0.1

0.1

0.1

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

-0.1

-0.1

-0.1

-0.2

-0.2

-0.2

Figure 13.14.

0.2

0.4

0.6

0.8

0.2

0.4

0.6

0.8

Numerical Solutions for the Heat Equation Based on the Explicit


Scheme.

whereby i = i = 0 for all i and so (13.143) reduces to a homogeneous, linear iterative


system
u(i+1) = A u(i) .
(13.145)
The solution will converge to zero, u(i) 0, as it is supposed to (why?), if and only if
A is a convergent matrix. But convergence will depend on the step sizes. For instance, if
= 1, choosing a spatial step size of h = .1 and a time step size of k = h 2 = .01 gives a
non-convergent matrix, and an invalid numerical scheme, while a much smaller step size,
e.g., k = .0005, gives a convergent matrix and a valid numerical scheme.
As we learned in Chapter 9, the convergence property of a matrix is fixed by its
spectral radius, i.e., the largest eigenvalue in magnitude; see Theorem 9.12. There is, in
fact, an explicit formula for the eigenvalues of the particular tridiagonal matrix (13.144).
Lemma 13.13. The eigenvalues of the (n 1) (n 1) matrix A in (13.144) are

k
,
k = 1, . . . , n 1.
2n
A proof of this fact, including an explicit formula for the associated eigenfunctions, is
outlined in Exercise . The matrix is convergent if and only if all its eigenvalues are less
than 1 in absolute value. Here, convergence requires

k
2
1 4 sin
< 1,
for all
k = 1, . . . , n 1.

2n
k = 1 4 sin2

Since 0 sin2 x 1, the convergence inequality will be valid as long as


| | < 21 .

In this way, we have deduced the basic stability criterion for the linear iterative system
(13.145). Referring to the formula (13.139), we find the condition
k
1
< ,
2
h
2
3/7/03

or
580

k<

h2
,
2

(13.146)
c 2003

Peter J. Olver

required for the coefficient matrix to be convergent.


As a result, this numerical method is called conditionally stable, which means that
not all choices of space and time steps lead to a convergence scheme. The convergence
criterion (13.146) places a rather severe restriction on the time step size. For instance, if
we have h = .01, and = 1, then we can only use a time step size of k < .00005, which
is minuscule. It would take a huge number of time steps to compute the value of the
solution at even a moderate times, e.g., t = 1. Moreover, owing to the limited accuracy
of computers, the propagation of round-off errors might then become a significant issue in
reducing the overall accuracy of the final solution values.
An unconditionally stable method one that does not restrict the time step can
be constructed by using the backwards difference formula
u(ti , xj ) u(ti1 , xj )
u
+ O(hk )
(ti , xj )
t
k

(13.147)

to the temporal derivatives instead. Substituting (13.147) and the same approximation
(13.136) for uxx into the heat equation, and then replacing i by i + 1, leads to the iterative
system

ui+1,j ui+1,j+1 2 ui,j + ui+1,j1 = ui,j ,

i = 0, 1, 2, . . . ,
j = 1, . . . , n 1,

(13.148)

where the parameter = k/h2 is as above. The initial and boundary conditions also
have the same form (13.140), (13.141). The system has the matrix form
b u(i+1) = u(i) + b(i+1) ,
A

(13.149)

b is obtained from the matrix A in (13.144) by replacing by . This defines


where A
an implicit method since we have to solve a tridiagonal linear system at each step in order
to compute the next iterate u(i+1) . However, as we learned in Section 1.7, a tridiagonal
linear system can be solved quite rapidly, and so this does not become a significant issue
in the practical implementation.
Let us look at the convergence of the implicit scheme. Taking homogeneous Dirichlet
boundary conditions, the system becomes
b1 u(i) ,
u(i+1) = A

b1 . Lemma 13.13 tells us that


and the convergence is now governed by the eigenvalues of A
b
the eigenvalues of A are
k = 1 + 4 sin2

k
,
2n

b1 has eigenvalues
As a result, its inverse A
1
=
k

3/7/03

k
1 + 4 sin
2n

581

k = 1, . . . , n 1.

k = 1, . . . , n 1.
c 2003

Peter J. Olver

b is a convergent
Since > 0, the latter are always less than 1 in absolute value, and so A
matrix for any > 0. Therefore, the implicit scheme (13.149) is convergent for any choice
of step sizes h, k.
Compute previous example
An even better numerical scheme is obtained by averaging the explicit and implicit
schemes (13.138), (13.148). The result is known as the CrankNicholson scheme, and takes
the form

ui+1,j+1 2 ui+1,j + ui+1,j1 + ui,j+1 2 ui,j + ui,j1 . (13.150)


ui+1,j ui,j =
2
We can write the system in matrix form

B u(i+1) = C u(i) +
where

1+
12

B=

12
1 + 12
..
.
21
..
.

1
2

b(i) + b(i+1) ,

1
21

C=

..
,
.

..
.

1
2

1
1
2

1
2

..

..

..
.
.

..
.

(13.151)

The convergence is governed by the eigenvalues of B 1 C. According to Exercise , these


are
k
1 2 sin2
2n ,
k = 1, . . . , n 1.
k =
2 k
1 + 2 sin
2n
If > 0, are these eigenvalues are less than 1 in absolute value, so that the CrankNicholson
scheme is also unconditionally stable. A detailed analysis based on the Taylor expansions
will show that the errors are of the order of k 2 and h2 , and so it is reasonable to choose
the time step to have the same order of magnitude as the space step, k h. This gives
the CrankNicholson scheme a considerable advantage over the other two schemes.
Example 13.14.
Numerical Solution Methods for the Wave Equation
Let us now look at numerical solution techniques for the wave equation. Although this
is in a sense unnecessary, owing to the explicit dAlembert formula (13.117) for the solution,
the experience we gain in designing a suitable method will serve us well in more complicated
situations, when there is no explicit formula, including one-dimensional inhomogeneous
media, and higher dimensional problems.
Consider the wave equation
2
2u
2 u
=
c
,
t2
x2

3/7/03

0 < x < `,
582

(13.152)

t 0,
c 2003

Peter J. Olver

on a homogeneous bar of length ell with constant wave speed c > 0. To be specific, we
impose Dirichlet boundary conditions
u(t, 0) = (t),

u(t, `) = (t),

and initial conditions


u
(0, x) = g(x),
t
We adopt the same uniformly spaced mesh
u(0, x) = f (x),

ti = i k,

t 0.

(13.153)

0 x `.

(13.154)

xj = j h,

where h = `/n, as in the heat equation.


In order to discretize the wave equastion, we replace the second order derivatives by
their standard finite difference approximations (13.130), namely
u(ti+1 , xj ) 2 u(ti , xj ) + u(ti1 , xj )
2u
(ti , xj )
+ O(h2 ),
2
t
k2
(13.155)
u(ti , xj+1 ) 2 u(ti , xj ) + u(ti , xj1 )
2u
2
(t , x )
+ O(k ),
x2 i j
h2
Since the errors are of orders of k 2 and h2 , we expect to be able to choose the space and
time step sizes of comparable magnitude:
k h.

Substituting the finite difference formulae (13.155) into the partial differential equation
(13.152), and rearranging terms, we are led to the iterative system
i = 1, 2, . . . ,
(13.156)
j = 1, . . . , n 1,
u(ti , xj ), with parameter

ui+1,j = ui,j + 2 ui,j+1 + 2 (1 2 ) ui,j + 2 ui,j1 ui1,j ,


for the numerical approximations ui,j

ck
> 0.
h
The boundary conditions (13.153) require that
=

ui,0 = i = (ti ),

(13.157)

ui,n = i = (ti ),

i = 0, 1, 2, . . . .

(13.158)

This allows us to rewrite the system in matrix form

where

u(i+1) = B u(i) u(i1) + b(i) ,

2 (1 2 )
2
2

2 (1 2 ) 2

..
..

B=
.
.
2

.
.

..
..

3/7/03

2
2 (1 2 )
583

, u(j)

(13.159)

2
u1,j
j
u2,j

.
.
(j)
.
= .. , b =
. .

0
u

n2,j
2 j
un1,j
(13.160)
c 2003

Peter J. Olver

The entries ui,j of u(i) are, as in (13.142), the numerical approximations to the solution
values u(ti , xj ) at the interior nodes. Note that the system (13.159) is a second order
iterative scheme, since computing the (i + 1)st iterate u(i+1) requires the value of the
preceding two iterates u(i) and u(i1) .
The one difficulty is getting the method started. We know u(0) since u0,j = fj = f (xj )
is determined by the initial position. However, we also need to find u (1) with entries
u1,j u(k, xj ) at time t1 = k in order to get off the ground, but the initial velocity
ut (0, x) = g(x) prescribes the derivatives ut (0, xj ) = g(xj ) = gj at time t0 = 0 instead.
One way to approach this would be to use the finite difference approximation
u(k, xj ) u(0, xj )
u1,j gj
u
(0, xj )

t
k
k
to compute the required values
u1,j = fj + k gj .
gj =

(13.161)

However, the approximation (13.161) is only accurate to order k, whereas the rest of the
scheme has error proportional to k 2 . Therefore, we would introduce a significantly larger
error at the initial step, and the resulting solution would not have the desired order of
accuracy.
In order to compute an initial approximation to u(1) with error on the order of k 2 , we
need to analyze the local error in more details. Note that, by Taylors theorem,
u(k, xj ) u(0, xj )
u
k 2u
u
c2 k 2 u

(0, xj ) +
(0,
x
)
=
(0,
x
)
+
(0, xj ) ,
j
j
k
t
2 t2
t
2 x2
where the error is now of order k 2 , and we have used the fact that u is a solution to the
wave equation. Therefore, we find
u
c2 k 2 2 u
(0, xj )
(0, xj ) +
t
2 x2
c2 k 2
c2 k 2 00
f (xj ) fj + k gj +
(f
2 fj + fj1 ) ,
= f (xj ) + k g(xj ) +
2
2 h2 j+1

u(k, xj ) u(0, xj ) + k

where we can use the finite difference approximation (13.130) for the second derivative of
f (x) if no explicit formula is known. Therefore, we can initiate the scheme by setting
u1,j =

1
2

2 fj+1 + (1 2 )fj +

1
2

2 fj1 + k gj ,

(13.162)

maintain order k 2 (and h2 ) accuracy.


Example 13.15. Consider the particular initial value problem
The stability analysis of the numerical scheme proceeds as follows. We first need to
recast the second order iterative system (13.159) into a first order system. As in Exercise
, this is accomplished by introducing the vector
(i)
u
(i)
R 2n2 .
z =
u(i1)
3/7/03

584

c 2003

Peter J. Olver

The Courant Condition.

Figure 13.15.
Then
z

(i+1)

=Cz

(i)

(i)

+c ,

where

C=

B
I

I
O

(13.163)

Therefore, the stability of the method will be determined by the eigenvalues of the coefficient matrix C. The eigenvector equation C z = z, can be written out in components

u
B u v = u,
u = v,
where
z=
.
v
Substituting the second equation into the first, we find
2

( B 1) v = 0,

or

Bv =

1
+

v.

The latter equation means that + 1 is an eigenvalue of B and v the corresponding


eigenvector. There is a straightforward connection between b and the matrix B, (13.144),
with parameter = 2 appearing the the numerical scheme for the heat equation. Using
Lemma 13.13 and Exercise , the eigenvalues of B are given by
+

k
1
= 1 + 2 2 sin2
,

k = 1, . . . , n 1.

Fixing k for the moment, we rewrite the eigenvalue equation in the form
2 2 ak + 1 = 0,

where

ak = 2 sin2

k
.
n

Each pair of solutions to this quadratic equation,


q

k = ak a2k 1 ,

(13.164)

gives two eigenvalues of the matrix C. If ak > 1, then one of the two eigenvalues will be
larger than one in magnitude, and hence the linear iterative system will have an exponentially growing mode, and hence k u(i) k as i for almost all choices of initial
data. This is clearly incompatible with the wave equation solution that we are trying to
approximate, which is periodic and hece remains bounded.
On the other hand, if | ak | < 1, then the eigenvalues (13.164) are complex numbers
of modulus 1, indicated stability (but not convergence) of the matrix C. Therefore, we
3/7/03

585

c 2003

Peter J. Olver

should require that all ]a1 , . . . , an1 are less than 1 in magnitude, which is guaranteed
provided
ck
h
=
< 1,
or
k< .
(13.165)
h
c
This places a restriction on the relative sizes of the time and space steps, and hence the
numerical scheme is conditionally stable.
The stability criterion (13.165) is known as the Courant condition, and can be assigned a simple geometric interpretation. Recall that the wave speed c is the slope of the
characteristic lines for the wave equation. The Courant condition requires that the mesh
slope, which is defined to be the ratio of the space step size to the time step size, namely
h/k, must be strictly greater than the characteristic slope c. This implies that a signal
starting at a mesh point (ti , xj ) will reach positions xj k/c at the next time ti+1 = ti + k,
which are still between the mesh points xj1 and xj+1 . Thus, characteristic lines that
start at a mesh point are not allowed to reach beyond the neighboring mesh points at the
next time step.
For instance, in Figure 13.15, the wave speed is c = 1.25. The first figure has equal
mesh spacing k = h, and does not satisfy the Courant condition (13.165), whereas the
second figure has k = 21 h, which does. Note how the characteristic lines starting at a
given mesh point have progressed beyond the neighboring mesh points after one time step
in the first case, but not in the second.

3/7/03

586

c 2003

Peter J. Olver

Chapter 14
The Laplace Equation
The fundamental partial differential equations that govern the equilibrium mechanics
of multi-dimensional media are the Laplace equation and its inhomogeneous counterpart,
the Poisson equation. The Laplace equation is arguably the most important differential
equation in all of applied mathematics. It arises in an astonishing variety of mathematical
and physical systems, ranging through fluid mechanics, electromagnetism, potential theory, solid mechanics, heat conduction, geometry, probability, number theory, and on and
on. The solutions to the Laplace equation are known as harmonic functions, and the
discovery of their many remarkable properties forms one of the most significant chapters
in the history of mathematics.
In this chapter, we concentrate on the Laplace and Poisson equations in a two-dimensional (planar) domain. Their status as equilibrium equations implies that the solutions
are determined by their values on the boundary of the domain. As in the one-dimensional
equilibrium boundary value problems, the principal cases are Dirichlet or fixed, Neumann
or free, and mixed boundary conditions arise. In the introductory section, we shall briefly
survey the basic boundary value problems associated with the Laplace and Poisson equations. We also take the opportunity to summarize the crucially important tripartite classification of planar second order partial differential equations: elliptic, such as the Laplace
equation; parabolic, such as the heat equation; and hyperbolic, such as the wave equation.
Each species has quite distinct properties, both analytical and numerical, and each forms
an essentially distinct discipline. Thus, by the conclusion of this chapter, you will have
encountered all three of the most important genres of partial differential equations.
The most important general purpose method for constructing explicit solutions of
linear partial differential equations is the method of separation of variables. The method
will be applied to the Laplace and Poisson equations in the two most important coordinate
systems rectangular and polar. Linearity implies that we may combine the separable
solutions, and the resulting infinite series expressions will play a similar role as for the
heat and wave equations. In the polar coordinate case, we can, in fact, sum the infinite
series in closed form, leading to the explicit Poisson integral formula for the solution. More
sophisticated techniques, relying on complex analysis, but (unfortunately) only applicable
to the two-dimensional case, will be deferred until Chapter 15.
Greens formula allows us to properly formulate the Laplace and Poisson equations in
self-adjoint, positive definite form, and thereby characterize the solutions via a minimization principle, first proposed by the nineteenth century mathematician Lejeune Dirichlet,
who also played a crucial role in putting Fourier analysis on a rigorous foundation. Minimization forms the basis of the most important numerical solution technique the finite
3/7/03

587

c 2003

Peter J. Olver

element method that we first encountered in Chapter 10. In the final section, we discuss
numerical solution techniques based on finite element analysis for the Laplace and Poisson
equations and their elliptic cousins, including the Helmholtz equation and more general
positive definite boundary value problems.

14.1. The Laplace Equation in the Plane.


The two-dimensional Laplace equation is the second order linear partial differential
equation
2u 2u
+ 2 = 0.
(14.1)
x2
y
Along with the heat and wave equations, it completes the trinity of truly fundamental
partial differential equations. A real-valued solution u(x, y) to the Laplace equation is
known as a harmonic function. The space of harmonic functions can thus be identified as
the kernel of the second order linear partial differential operator
=

2
2
+
,
x2
y 2

(14.2)

known as the Laplace operator , or Laplacian for short. The inhomogeneous or forced
version, namely
2u 2u
2 = f (x, y)
(14.3)
[ u ] =
x2
y
is known as Poissons equation. It forms the two-dimensional analogue of the basic equilibrium equation (10.12) for a bar, with the overall minus sign playing an analogous role.
The Laplace and Poisson equations arise as the basic equilibrium equations in a remarkable variety of physical systems. For example, we may interpret u(x, y) as the displacement of a membrane, e.g., a drum skin. The inhomogeneity f (x, y) in the Poisson
equation represents an external forcing of the membrane. Another example is in the thermal equilibrium of planar bodies; here u(x, y) represents the temperature and f (x, y) an
external heat source. In fluid mechanics and electrostatics, u(x, y) represents the potential
function whose gradient u generates the corresponding flow; see below for details. The
dynamical counterparts to the Laplace equation are multi-dimensional versions of the heat
and wave equations, to be analyzed in Chapter 16.
Since both the Laplace and Poisson equations describe equilibria, they arise in most
physical situations in the context of boundary value problems. We seek a solution u(x, y) to
the partial differential equation defined on a fixed bounded, open domain (x, y) R 2 .
The solution will be required to satisfy suitable conditions on the boundary of the domain,
denoted , which will consist of one or more simple, closed curves, as illustrated in

See Chapter A for the precise definitions of the terms domain, bounded, boundary,

etc.

3/7/03

588

c 2003

Peter J. Olver

Figure 14.1.

Figure 14.2.

Planar Domain.

Dirichlet Boundary Conditions.

Figure 14.1. As in the one-dimensional case, there are several important types of boundary
conditions.
The first are the fixed or Dirichlet boundary conditions, which specify the value of the
function u on the boundary:
u(x, y) = h(x, y)

for

(x, y) .

(14.4)

Under reasonable conditions on the type of domain, the Dirichlet conditions (14.4) serve to
uniquely specify the solution u(x, y) to the Laplace or Poisson equation. Physically, in the
case of a free or forced membrane, the Dirichlet boundary conditions correspond to gluing
the edge of the membrane to a wire at height h(x, y) over each boundary point (x, y) ,
as illustrated in Figure 14.2. Uniqueness means that the shape of the boundary wire will
uniquely specify the vertical displacement of the membrane in equilibrium. Similarly,
in the modeling of thermal equilibrium, a Dirichlet boundary condition represents the
imposition of a prescribed temperature distribution, represented by the function h, along
the boundary of the plate.
3/7/03

589

c 2003

Peter J. Olver

The second type of boundary conditions are the Neumann boundary conditions
u
= u n = k(x, y)
n

on

(14.5)

in which the normal derivative of the solution u on the boundary is prescribed. For
example, in thermal equilibrium, a Neumann boundary condition specifies the heat flux into
the domain through its boundary. The most important are the no-flux or homogeneous
Neumann boundary conditions, where k(x, y) 0. In thermomechanics, this corresponds
to an insulated boundary. In the case of a membrane, it corresponds to the edge of the
drum being left free. In fluid mechanics, where u represents the fluid potential, the noflux conditions imply that the normal component of the velocity vector vanishes, and so
corresponds to a solid boundary that does not allow the fluid to flow across it.
Finally, one can mix the boundary conditions, imposing Dirichlet conditions on part
of the boundary, and Neumann on the complementary part. The general mixed boundary
value problem has the form
u = f

in ,

u = h on

D,

u
=k
n

on N,

(14.6)

with the boundary = D N being the disjoint union of a Dirichlet part, denoted
by D, and a Neumann part N . For example, in heat conduction, if we want to find the
equilibrium temperature distribution over a planar body, the Dirichlet part of the boundary
is where we fix the temperature, while the Neumann part is insulated, or, more generally,
has prescribed heat flux. Similarly, for displacement of a membrane, the Dirichlet part is
where the edge of the drum is attached to a support, while the homogeneous Neumann
part is where it is left hanging free.
Classification of Linear Partial Differential Equations in the Plane
We have, at last, encountered all three of the fundamental linear, second order, partial
differential equations for functions of two variables. The homogeneous versions of the
trinity are
a) The wave equation:

utt c2 uxx = 0,

hyperbolic,

b) The heat equation:


c) Laplaces equation:

ut uxx = 0,
uxx + uyy = 0,

parabolic,
elliptic.

The last column is the equations type, according to a general taxonomy of partial
differential equations. An explanation of the choice of terminology will appear later.
The wave, heat and Laplace equations are the prototypical representatives of the three
most important genres of partial differential equations. The student should understand
that there are fundamental differences. Equations governing vibrations, such as the wave
equation, are typically hyperbolic. Equations governing diffusion, such as the heat equation, are parabolic. Hyperbolic and parabolic equations are dynamical processes, and one
of the variables is identified with the time. On the other hand, equations of equilibrium,
including the Laplace and Poisson equations, are typically elliptic, and only involve spatial
3/7/03

590

c 2003

Peter J. Olver

variables. Elliptic partial differential equations are associated with boundary value problems, whereas parabolic and hyperbolic equations lead to initial-boundary value problems,
with, respectively, one or two required initial conditions. Furthermore, numerical solution
techniques and requirements are of a fundamentally different character in all three cases.
While the initial tripartite classification is most evident in partial differential equations
in two variables, the terminology and underlying properties of these three fundamental
genres carries over to equations in higher dimensions. Most of the important partial
differential equations arising in applications appear in one of these three general classes,
and it is fair to say that the field of partial differential equations breaks into three major,
disjoint subfields. Or, rather four subfields, the last being all the equations, including
higher order equations, that do not fit into this neat categorization, which is, of course,
yet further subdivided into a variety of subspecies.
The classification of linear, second order partial differential equations for a scalarvalued function u(x, y) of two variables proceeds in the following manner. The most
general such equation has the form
L[ u ] = A uxx + B uxy + C uyy + D ux + E uy + F u = f,

(14.7)

where the coefficients A, B, C, D, E, F are all allowed to be functions of (x, y), as is the
inhomogeneity or forcing function f = f (x, y). The equation is homogeneous if and only
if f 0. We assume that at least one of the leading coefficients A, B, C is nonzero, as
otherwise the equation is of first order.
The key quantity that determines the type of such a partial differential equation is its
discriminant
(x, y) = B 2 4 A C.
(14.8)
This should (and for good reason) remind the reader of the discriminant of the quadratic
equation
Q(, ) = A 2 + B + C 2 + D + E + F = 0.
(14.9)
The set of solutions (, ) to such an equation describes a curve; namely, a conic section.
In the nondegenerate cases, the discriminant = B 2 4 A C determines its geometrical
type; it is an ellipse when > 0, a parabola when = 0, or a hyperbola when < 0.
This tripartite classification provides the underlying motivation for the terminology used
to classify second order partial differential equations.
Definition 14.1. A linear, second order partial differential equation (14.7) at a point
(x, y) is called
a) elliptic
(x, y) < 0,
b) parabolic
if
(x, y) = 0,
c) hyperbolic

(x, y) > 0.

For dynamical equations, we will identify y as the time variable t.

3/7/03

591

c 2003

Peter J. Olver

In particular, the wave equation uxx uyy = 0 has discriminant = 4, and is


hyperbolic. The heat equation uxx uy = 0 has discriminant = 0, and is parabolic.
Finally, the Poisson equation uxx + uyy = f has discriminant = 4, and is elliptic.
Example 14.2. Since the coefficients in the partial differential equation are allowed
to vary over the domain, the type of an equation can vary from point to point. Equations
that change type are much less common, as well as being much harder to handle. One
example arising in the theory of supersonic aerodynamics is the Tricomi equation
y uxx uyy = 0.

(14.10)

Comparing with (14.7), we find that A = y, C = 1 and B = D = E = F = f = 0. The


discriminant in this particular case is = 4 y, and hence the equation is hyperbolic when
y > 0, elliptic when y < 0, and parabolic on the transition line y = 0. The hyperbolic
region corresponds to subsonic fluid flow, while the supersonic regions are of elliptic type.
The transitional parabolic boundary represents the shock line between the sub- and supersonic regions.
Characteristics
Certain curves play a distinguished role in the analysis of second order, linear partial
differential equations. A smooth curve x(t) is called a characteristic curve for the second
T

6= 0 satisfies the
order partial differential equation (14.7) if its tangent vector x = x y
quadratic characteristic equation

A(x, y) y 2 B(x, y) x y + C(x, y) x2 = 0.

(14.11)

Pay careful attention to the form of the characteristic equation; in particular, the first and
zeroth order terms in the original partial differential equation play no role.
For example, consider the wave equation
uyy c2 uxx = 0.
In this case, A = c2 , B = 0, C = 1, and so (14.11) takes the form

x2 c2 y 2 = 0,

and so

x = c y.

The solutions to the resulting ordinary differential equation are


x(t) = c y(t) + k,

(14.12)

where k is an integration constant. Therefore, the wave equation has two characteristic
curves passing through each point (a, b), namely the straight lines (14.12) of slope 1/c.
Thus, the general definition of characteristic curve is in accordance with our earlier definition (13.118) of the characteristic lines for the wave equation. (In our earlier discussion,

Here we are using y as the time variable, rather than t, which is now playing the role of
the curve parameter.

3/7/03

592

c 2003

Peter J. Olver

the geometrical roles of the x and y = t variables were reversed, which is why we now find
the reciprocal characteristic value of the slope.)
On the other hand, the Laplace equation
uxx + uyy = 0
has no (real) characteristic curves since the characteristic equation (14.11) reduces to

x2 + y 2 = 0. Finally, for the heat equation


uxx uy = 0,

the characteristic equation is simply y 2 = 0, and so there is only one characteristic curve
through each point (a, b), namely the horizontal line y = b. In this manner, one distinguishes elliptic, parabolic, and hyperbolic partial differential equations by the number of
(real) characteristic curves passing through a point namely, zero, one and two, respectively. Further discussion of characteristics for nonlinear partial differential equations can
be found in Section 21.1.
Some general remarks on the role of characteristic curves follow, albeit without proof.
As with the wave equation, signals and localized waves in a partial differential equation
tend to propagate along the characteristic curves. This fact lies at the foundation of
geometric optics. Light rays move along characteristic curves, and are thereby subject
to the optical phenomena of refraction and focusing. Similarly, since the characteristic
curves for the heat equation are the horizontal lines parallel to the x axis, the signals
propagate instantaneously, in accordance with our observation that the effect on an initial
concentrated heat source is immediately felt all along the bar. Finally, elliptic equations
have no characteristics, and as a consequence, do not admit propagating signals; the effect
of a localized disturbance, say on a membrane, is immediately felt everywhere.

14.2. Separation of Variables.


One of the earliest and still most widely used techniques for constructing explicit
analytical solutions to partial differential equations is the method of separation of variables.
We have, in fact, already applied the separation of variables method to construct particular
solutions to the heat and wave equations. In each case, we looked for a solution in the
form of a product u(t, x) = h(t) v(x). In the general separation of variables method, one
does not know either factor in advance. If the method succeeds (which is not guaranteed),
both will be determined as solutions to associated ordinary differential equations.
For the Laplace equation, the solution depends on x and y, and so the separation of
variables ansatz becomes
u(x, y) = v(x) w(y).
(14.13)
Let us substitute this expression into the Laplace equation. First of all,
2u
= v 00 (x) w(y),
x2
3/7/03

593

2u
= v(x) w 00 (y),
y 2
c 2003

Peter J. Olver

where the primes indicate ordinary derivatives, and so


u =

2u 2u
+ 2 = v 00 (x) w(y) + v(x) w 00 (y) = 0.
x2
y

The method of separation of variables will succeed if we are able to manipulate the resulting
equation so as to place all of the terms involving x on one side of the equation and all the
terms involving y on the other. Here, we first write
v 00 (x) w(y) = v(x) w 00 (y).
Dividing both sides by v(x) w(y) (which we assume is not identically zero as otherwise the
solution would be trivial) yields
v 00 (x)
w00 (y)
=
,
v(x)
w(y)

(14.14)

and effectively separates the x and y variables on each side of the equation. Now, how
could a function of x alone be equal to a function of y alone? A moments reflection should
convince the reader that this can happen if and only if the two functions are constant . We
use , known as the separation constant, to designate this common value. Then (14.14)
reduces to a pair of ordinary differential equations
v 00 v = 0,

w 00 + w = 0,

for the individual factors v(x) and w(y). We already know how to solve both of these
ordinary differential equations by elementary techniques. There are three different cases,
depending on the sign of the separation constant . Each case leads to four different
solutions, and we collect the entire family of separable solutions together in the following
table.
Separable Solutions to Laplaces Equation

v(x)
2

w(y)
e

, e

u(x, y) = v(x) w(y)


y

= < 0

cos x, sin x

=0

1, x

1, y

= 2 > 0

e x , e x

cos y, sin y

e y cos x, e y sin x,
e y cos x, e y sin x
1, x, y, x y
e x cos y,

e x sin y,

e x cos y, e x sin y

Technical detail: one should assume that the underlying domain is connected for this to be
true; however, in practical analysis, this technicality is irrelevant.

3/7/03

594

c 2003

Peter J. Olver

Since Laplaces equation is linear, we can utilize superposition to combine these types
of solutions together, either as finite linear combinations, or, provided we pay proper
attention to convergence issues, as infinite series.
For boundary value problems, the applicability of such separable solutions imposes
fairly severe restrictions on the geometry of the domain. The ansatz (14.13) effectively
requires that the domain be rectangular. Thus, we are led to consider the boundary value
problem for Laplaces equation
u = 0

on a rectangle

R = { 0 < x < a, 0 < y < b }.

(14.15)

To illustrate the method, consider the following Dirichlet boundary conditions


u(x, 0) = f (x),

u(x, b) = 0,

u(0, y) = 0,

u(a, y) = 0.

(14.16)

We are only allowing a nonzero boundary condition on one of the four sides of the rectangle,
in order to simplify the analysis. The Dirichlet boundary value problem can then be solved
by adding together the solutions to the four boundary value problems which only have
nonzero boundary conditions on one side of the rectangle; see Exercise .
Of the variety of solutions available through separation of variables, the only ones
that will play a role are those that respect the boundary conditions. Putting the nonzero
boundary condition aside for the moment, we ask that u(x, y) = v(x) w(y) be zero on the
top, right and left sides of the rectangle. This requires
v(0) = v(a) = 0,

w(b) = 0.

Referring to the above table, the first condition

sin x,
v(x) =
x,

sinh x,

v(0) = 0 requires
= 2 > 0,
= 0,
= 2 < 0,

where sinh z = 12 (ez ez ) is the usual hyperbolic sine function. The second and third
cases cannot satisfy the second boundary condition v(a) = 0, and so we discard them. The
first case leads to the condition
v(a) = sin a = 0,

and hence

a = , 2 , 3 , . . . .

Therefore, the separation constant has the form


= 2 =

n2 2
,
a2

(14.17)

with the corresponding solutions


n x
,
n = 1, 2, 3, . . . .
(14.18)
a
Remark : We have just recomputed the known eigenvalues and eigenfunctions of the
familiar boundary value problem
v(x) = sin

v 00 + v = 0,
3/7/03

v(0) = v(a) = 0.
595

c 2003

Peter J. Olver

The third boundary condition w(b) = T 0 requires that, up to constant multiple,


w(y) = sinh (b y) = sinh

n (b y)
.
a

(14.19)

Therefore, each of the separable solutions


n x
n (b y)
(14.20)
sinh
,
n = 1, 2, 3, . . . ,
a
a
satisfies the three homogeneous boundary conditions. It remains to analyze the boundary
condition along the bottom edge of the rectangle. We try a linear superposition of the
separable solutions in the form of an infinite series
un (x, y) = sin

u(x, y) =

cn un (x, y) =

n=1

n=1

cn sin

n x
n (b y)
sinh
,
a
a

where the coefficients c1 , c2 , . . . are to be determined by the remaining inhomogeneous


boundary condition. At the bottom edge y = 0 we find
u(x, 0) =

cn sinh

n=1

n x
n b
sin
= f (x),
a
a

(14.21)

which takes the form of a Fourier sine series for the function f (x). According to (11.78),
for the interval 0 < x < a, the coefficients bn of the Fourier sine series
f (x) =

n=1

n x
bn sin
a

are given by

2
bn =
a

f (x) sin
0

n x
dx.
a

(14.22)

Comparing (14.21), (14.22), we see that


bn
.
n b
sinh
a
Therefore, the solution to the boundary value problem takes the form of an infinite series
cn sinh

n b
= bn
a

u(x, y) =

n=1

or

cn =

n (b y)
sinh
n x
a
,
bn sin
n
b
a
sinh
a

(14.23)

where bn are the Fourier


Z asine coefficients (14.22) of f (x). It can be shown, cf. Exercise ,
| f (x) | dx < , then the series solution converges on the entire
that if f is integrable,
0

rectangle R. Moreover, if y > 0, the go to zero exponentially fast, and so the solution can
be well approximated by partial summation. The exponentially fast decay of the Fourier
coefficients implies that u(x, y) is an infinitely differentiable function of x at each y > 0.
In fact, as we shall see, the solutions to the Laplace equation are always analytic functions
inside the domain even when the boundary conditions are quite unsmooth.
3/7/03

596

c 2003

Peter J. Olver

Figure 14.3.

Square Membrane on a Wire.

Example 14.3. A membrane is stretched over a wire in the shape of a unit square
with one side bent in half, as graphed in Figure 14.3. The precise boundary conditions are

x,
0 x 21 ,
y = 0,

y = 0,
1 x,
2 x 1,
u(x, y) =
0,
0 x 1,
y = 1,

0,
x = 0,
0 y 1,

0,
x = 1,
0 y 1.
The Fourier sine series of the inhomogeneous boundary function is readily computed:
(
x,
0 x 21 ,
f (x) =
1
1 x,
2 x 1,

sin(2m + 1) x
sin 3 x sin 5 x
4
4 X
(1)m
= 2 sin x
+
= 2
.

9
25
m=0
(2m + 1)2

Therefore, the solution to the boundary value problem is given by the Fourier series
u(x, y) =

4 X
m sin(2m + 1) x sinh(2m + 1) (1 y)
.
(1)
2 m = 0
(2m + 1)2 sinh(2m + 1)

For y > 0, the series converges rapidly owing to the exponential decay of its terms, and so
can be well approximated by its first few summands. In Figure 14.3 we graph the sum of
the first 10 terms in the series, which is a reasonably good approximation except when we
are very close to the raised corner of the wire, which is the point of maximal displacement
of the membrane. This is indicative of a very general and important fact: a harmonic
function achieves its maximum and minimum values only on the boundary of its domain;
see Corollary 14.8 for details.
Polar Coordinates
The method of separation of variables can be used in certain other very special geometries. One particularly important case is a circular disk. Let us take the disk to have
3/7/03

597

c 2003

Peter J. Olver

radius 1, centered at the origin. Consider the Dirichlet boundary value problem
u = 0,

x2 + y 2 < 1,

and

x2 + y 2 = 1,

u = h,

(14.24)

so that the function u(x, y) satisfies the Laplace equation on the unit disk and has Dirichlet
boundary conditions on the unit circle. For example, u(x, y) might represent the displacement of a circular drum that is attached to a wire of height
h(x, y) = h(cos , sin ) h(),

0 2 ,

(14.25)

above each point (x, y) = (cos , sin ) on the unit circle.


The rectangular separable solutions are not particularly helpful in this situation. The
fact that we are dealing with a circular geometry inspires us to adopt polar coordinates
x = r cos ,

y = r sin ,

or

r=

x2 + y 2 ,

= tan1

y
,
x

and write the solution as a function of r, . We also need to relate derivatives with respect
to x and y to those with respect to r and . Performing a standard chain rule computation,
we find

= cos
+ sin
,
r
x
y

= r sin
+ r cos
,

x
y

so

= cos
x

= sin
y

sin

r
r

cos
+
r
r

(14.26)

These formulae allow us to rewrite the Laplace equation in polar coordinates; after some
calculation in which many of the terms cancel, we find
2u 2u
2 u 1 u
1 2u
+
=
u
=
+
+
= 0.
x2
y 2
r2
r r
r2 2

(14.27)

The boundary conditions are on the unit circle r = 1, and so, by (14.25), take the form
u(1, ) = h().
Note especially that u(r, ) and the boundary value h() are 2 periodic functions in the
angular coordinate:
u(r, + 2 ) = u(r, ),

h( + 2 ) = h().

(14.28)

Polar separation of variables is based on the product ansatz


u(r, ) = v(r) w().

(14.29)

Substituting (14.29) into the polar form (14.27) of Laplaces equation, we find
v 00 (r) w() +
3/7/03

1 0
1
v (r) w() + 2 v(r) w 00 () = 0.
r
r
598

c 2003

Peter J. Olver

We now separate variables by moving all the terms involving r onto one side of the equation
and all the terms involving onto the other. This is accomplished by first rewriting the
equation in the form

1
1 0
00
v (r) + v (r) w() = 2 v(r) w 00 (),
r
r
and then dividing by the product v(r) w(), whence
r2 v 00 (r) + r v 0 (r)
w00 ()
=
= .
v(r)
w()
As in the rectangular case, a function of r can equal a function of if and only if both
are equal to a common separation constant . Therefore, the partial differential equation
reduces to a pair of ordinary differential equations
r2 v 00 + r v 0 r = 0,

w 00 + w = 0,

(14.30)

for the components of the separable solution (14.29). These take the form of eigenfunction
equations in which the separation constant plays the role of the eigenvalue.
We have already solved the eigenvalue problem for w(). According to (14.28), w( +
2 ) = w() must be a 2 periodic eigenfunction. Therefore, the eigenvalues (separation
constants) are = n2 , with associated eigenfunctions
1,

sin n ,

cos n ,

n = 0, 1, 2, . . . .

(14.31)

Using the value = n2 , the remaining ordinary differential equation


r2 v 00 + r v 0 n2 r = 0.

(14.32)

has the form of a second order Euler equation for v(r). As discussed in Example 7.25, the
solutions are obtained by substituting the power ansatz v(r) = r k into the equation. The
resulting characteristic equation requires
k 2 n2 = 0,

and hence

k = n.

Therefore, if n 6= 0, we find two linearly independent solutions,


v1 (r) = r n ,

v2 (r) = r n ,

n = 1, 2, . . . .

(14.33)

n = 0,

(14.34)

If n = 0, we have an additional logarithmic solution


v1 (r) = 1,

v2 (r) = log r,

as in Exercise . Combining (14.31) and (14.33), (14.34), we recover the following separable
polar coordinate solutions to the Laplace equation
1,
log r,

r n cos n ,
r n cos n ,

r n sin n ,
r n sin n ,

n = 1, 2, 3, . . . .

(14.35)

Now, the solutions in the top row of (14.35) are continuous (in fact analytic) at the
origin, whereas the solutions in the bottom row have singularities as r 0. The latter
3/7/03

599

c 2003

Peter J. Olver

are not relevant since we require the solution u(x, y) to remain bounded and smooth
even at the center of the disk. Thus, we should only use the former in concocting a series
solution

a0
u(r, ) =
+
an rn cos n + bn rn sin n .
(14.36)
2
n=1
At the boundary r = 1, we must have

Therefore,

a0
+
an cos n + bn sin n = h().
u(1, ) =
2
n=1

1
an =

1
bn =

h() cos n d,

h() sin n d,

(14.37)

are precisely the Fourier coefficients (11.23) of the boundary value function h().
Remark : Introducing the complex variable z = r e i = x + i y allows us to write
z n = rn e i n = rn cos n + i r n sin n .

(14.38)

Therefore, the separable solutions for n 1 are nothing but the harmonic polynomial
solutions derived in Example 7.42, namely
rn cos n = Re z n ,

rn sin n = Im z n .

(14.39)

Exploitation of the remarkable connections between the solutions to the Laplace equation
and complex functions will form the focus of Chapter 15.
Example 14.4. Consider the boundary value problem on the unit disk with
u(1, ) =

for

< < .

(14.40)

The boundary data can be interpreted as attaching a circular membrane to a wire in the
shape of a single turn of a spiral helix bent over the unit circle. The wire has a jump
discontinuity at (1, 0). The Fourier series for h() = was computed in Example 11.2,
namely

sin 2 sin 3 sin 4


+

+ .
h() = 2 sin
2
3
4
Therefore, the solution to the Laplace equation with these boundary conditions is

r2 sin 2 r3 sin 3 r4 sin 4


u(r, ) = 2 r sin
+

+ .
(14.41)
2
3
4
In fact, this series can be explicitly summed. Using (14.38), we find

z2
z3
z4
u(x, y) = 2 Im z
+

+ = 2 Im log(1 + z)
2
3
4
3/7/03

600

c 2003

Peter J. Olver

Figure 14.4.

Membrane Attached to Helical Wire.

is twice the imaginary part of the Taylor series for log(1 + z). If we write 1 + z = e i =
exp(log + i ), then the solution (14.41) is given in the explicit form
u(x, y) = 2 = 2 tan1

y
,
1+x

(14.42)

and is plotted in Figure 14.4. The quantity is the angle that the line passing through
the two points (x, y) and (1, 0) makes with the x-axis, as in Figure a2 . You should
try to convince yourself that, on the unit circle, 2 = has the correct boundary values!
Moreover, even though the boundary values are discontinuous, the solution is an analytic
function inside the disk.
Unlike the rectangular series solution, the general Fourier series solution (14.36) for
a disk can, in fact, be summed in closed form! If we substitute the Fourier formulae
(14.37) into (14.36) remembering to change the integration variable to, say, to avoid
a notational conflict we find

!
Z

1 X n
1
h()
+
u(r, ) =
r cos n cos n + sin n sin n
d

2 n=1
!

(14.43)
Z

1 X n
1
r cos n ( ) d.
h()
+
=

2 n=1
We next show how to sum the series in brackets. Using (14.38), we can write it as the real
part of a geometric series:
!

1 X n
1
z
1+z
1 X n
= Re
+
+
+
= Re
r cos n = Re
z
2 n=1
2 n=1
2 1z
2(1 z)

Re (1 + z z | z |2 )
1 | z |2
1 r2
(1 + z)(1 z)
=
= Re
=
=
.
2 | 1 z |2
2 | 1 z |2
2 | 1 z |2
2 (1 + r 2 2 r cos )
3/7/03

601

c 2003

Peter J. Olver

The Poisson Kernel.

Figure 14.5.

Substituting this formula back into (14.43), we have deduced the important Poisson Integral Formula for the solution to the boundary value problem, named after the French
mathematician SimeonDenis Poisson.
Theorem 14.5. The solution u(r, ) to the Laplace equation in the unit disk with
Dirichlet boundary conditions u(1, ) = h() is
Z
1
1 r2
d.
u(r, ) =
h()
(14.44)
2
1 + r 2 2 r cos( )
Example 14.6. A particularly important case is when the boundary value
h() = ( )
is a delta function concentrated at the point (cos , sin ) on the unit circle. The solution
to the resulting boundary value problem is the Poisson integral kernel
u(r, ) =

1 r2
1 | z |2
=
.
2 | 1 z |2
2 1 + r 2 2 r cos( )

(14.45)

The reader may enjoy verifying that this function does indeed, solve the Laplace equation
and has the correct boundary values in the limit as r 1. Physically, if u(r, ) represents
the equilibrium temperature of the disk, then the delta function boundary values correspond to a unit concentrated heat source being applied at a single point on the boundary.
The solution is sketched in Figure 14.5. The general Poisson integral formula (14.44) results from our general superposition principle, based on the fact that general boundary
data can be written as a superposition,
Z
h() ( ) d,
h() =

of delta functions.
3/7/03

602

c 2003

Peter J. Olver

If we set r = 0 in the Poisson formula (14.44), then we obtain


Z
1
h() d.
u(0, ) =
2

(14.46)

The left hand side is the value of u at the origin; the right hand side is the average of its
boundary values around the unit circle. This is a particular case of an important general
fact.
Theorem 14.7. The value of a harmonic function u at a point (x0 , y0 ) is equal to
the average of its values on any circle centered at the point:
I
Z 2
1
1
u(x0 + r cos , y0 + r sin ) d.
(14.47)
u(x0 , y0 ) =
u ds =
2 r C
2 0
The result requires that u be harmonic on the entire closed disk bounded by this circle.
Proof : We use a scaling and translation to map the disk of radius r centered at (x 0 , y0 )
to the unit disk centered at the origin. Specifically, we set
U (x, y) = u(x0 + r x, y0 + r y).

(14.48)

An easy chain rule computation proves that U (x, y) is harmonic on the unit disk, with
boundary values
h() = U (cos , sin ) = u(x0 + r cos , y0 + r sin ).
Therefore, by (14.46) ,
1
U (0, 0) =
2

1
h() d =
2

U (cos , sin ) d.

Replacing U by its formula (14.48) produces the desired result.

Q.E.D.

An important consequence of the integral formula (14.47) is the Maximum Principle


for harmonic functions.
Corollary 14.8. If u(x, y) is a nonconstant harmonic function defined on a domain
, then u does not have a local maximum or local minimum at any interior point of .
Proof : The average of a real function lies strictly between its maximum and minimum
values (except in the trivial case when the function is constant). Theorem 14.7 therefore
implies that u(x, y) lies strictly between its maximal and minimal values on any small
circle centered at (x, y). But if u(x, y) had a local maximum (minimum), then it would
be larger (smaller) than its values at all nearby points, and, in particular, its values on a
small circle around the point. This contradiction proves the theorem.
Q.E.D.
As a consequence, harmonic functions achieve their maxima and minima only at
boundary points of a domain. Any interior critical point, where u = 0, must be a
saddle point. Physically, if we interpret u(x, y) as the vertical displacement of an unforced
membrane, then Corollary 14.8 says that the membrane cannot have any internal bumps
3/7/03

603

c 2003

Peter J. Olver

its highest and lowest points are necessarily on the boundary of the domain. This
reconfirms our physical intuition: the restoring force exerted by the stretched membrane
will serve to flatten any bump, and hence a membrane with a local maximum or minimum
cannot be in equilibrium. A similar interpretation holds for heat conduction. A body
in thermal equilibrium can achieve its maximum and minimum temperature only on the
boundary of the domain. Again, physically, heat energy would flow away from any internal
maximum, or towards any local minimum, and so if the body contained a local maximum
or minimum on its interior, it could not be in thermal equilibrium.
This concludes our discussion of the method of separation of variables and series
solutions to the planar Laplace equation. The method of separation of variables does
apply in a few other special coordinate systems. See Exercise for one example, and
[87, 90, 92] for a complete account, including connections with underlying symmetries of
the equation.

14.3. The Greens Function.


Now we turn to the Poisson equation (14.3), which is the inhomogeneous form of the
Laplace equation. In Section 10.2, we learned how to solve one-dimensional boundary
value problems by use of the Greens function. This important technique can be adapted
to solve inhomogeneous boundary value problems for elliptic partial differential equations
in higher dimensions, including Poissons equation. As in the one-dimensional situation,
the Greens function is the solution to the homogeneous boundary value problem in which
the inhomogeneity is a concentrated unit impulse a delta function. The solution to the
general forced boundary value problem is then obtained via linear superposition, that is,
as a convolution integral with the Greens function.
The first order of business is to establish the proper form for the delta function or
unit impulse in our two-dimensional situation. We denote the delta function concentrated
at position = (, ) R 2 by
(x) = (,) (x, y) = (x ).

(14.49)

The delta function can be viewed as the limit, as n , of a sequence of more and more
highly concentrated unit sources fn (x, y), which are required to satisfy
ZZ
lim fn (x, y) = 0, for (x, y) 6= (0, 0),
while
fn (x, y) dx dy = 1.
n

A good example of a suitable sequence are the radial Gaussian distributions


2

e (x +y
fn (x, y) =
n

)/n

(14.50)

which relies on the fact that


ZZ
3/7/03

R2

e (x

+y 2 )/n

604

dx dy = n ,
c 2003

Peter J. Olver

Gaussian Distributions Converging to the Delta Function.

Figure 14.6.

established in Exercise . Note in Figure 14.6 how the Gaussian profiles become more
and more concentrated at the origin, while maintaining a unit volume underneathe their
graphs.
Alternatively, one can assign the delta function a dual interpretation as a linear functional on the vector space of continuous scalar-valued functions. We formally prescribe the
delta function by the integral formula

ZZ
f (, ),
(, ) ,
h (,) ; f i =
(,) (x, y)f (x, y) dx dy =
(14.51)
0,
(, ) 6 ,

for any continuous function f (x, y) and any domain R 2 . As in the one-dimensional
situation, we will avoid defining the integral when the delta function is concentrated at a
boundary point (, ) of the domain of integration.
Since double integrals can be evaluated as repeated one-dimensional integrals, cf. (A.48),
we can conveniently view
(,) (x, y) = (x) (y) = (x ) (y )

(14.52)

as the product of a pair of one-dimensional delta functions. Indeed, if the domain

= R = a < x < b, c < y < d

is a rectangle, then
ZZ

(,) (x, y)f (x, y) dx dy =


=

b
a
b

(x ) (y ) f (x, y) dy dx
a

(x ) f (x, ) dx = f (, ),
a

provided a < < b and c < < d, i.e., (, ) R; otherwise, for (, ) 6 R, the integral is
0, in accordance with (14.51).
To find the Greens function, we must solve the equilibrium equation subject to a
concentrated unit delta force at a prescribed point = (, ) in the domain. In the
case of Poissons equation, this takes the form
u = ,
3/7/03

or

2 u u

= (x ) (y ),
x2
y
605

(x, y) ,
c 2003

(14.53)

Peter J. Olver

along with homogeneous boundary conditions either Dirichlet or mixed. (The nonuniqueness of solutions to the pure Neumann boundary value problem precludes the existence
of a Greens function.) The resulting solution to the particular Poisson boundary value
problem is denoted
G(x; ) = G(x, y; , ),
(14.54)
and called the Greens function associated with the given boundary value problem. For
each fixed value of = (, ), the function (14.54) measures the effect, at position x = (x, y)
of a concentrated force applied at position = (, ).
Once we know the Greens function, the solution to the general Poisson boundary
value problem
u = f

in

u=0

on

(14.55)

is reconstructed by using a superposition principle. We regard the forcing function


ZZ
f (x, y) =
(x ) (y )f (, ) d d

as a linear combination of delta impulses, whose strength at each point equals the value
of f . Linearity implies that the solution to the boundary value problem is the same
combination of Greens function responses to each of the constituent impulses. The net
result is the fundamental superposition formula
ZZ
u(x, y) =
G(x, y; , ) f (, ) d d
(14.56)

for the solution to the general inhomogeneous boundary value problem.


As in the one-dimensional situation, self-adjointness of the boundary value problem
will imply that the Greens function is symmetric under interchange of its arguments:
G(, ; x, y) = G(x, y; , ).

(14.57)

(The proof of this fact is not hard, but will not be given in detail here.) Symmetry has the
following interesting physical interpretation: Let (x, y), (, ) be any pair of points in
the domain. If we apply a unit impulse at the first point, and measure the effect at the
second, the result is exactly the same as if we apply the impulse at the second point, and
measure the effect at the first! The reader may wish to reflect on whether this is physically
plausible: if we push a membrane, of arbitrary shape, with unit force concentrated at
and measure the deflection at position x the result is the same as if we apply our force at
position x and measure the deflection at . (The deflections at other points in the domain
will typically bear very little connection with each other.) Similarly, in electrostatics, the
solution u(x, y) is interpreted as the electrostatic potential for a system in equilibrium. A
delta function corresponds to a point charge, e.g., an electron. The symmetry property
says that the electrostatic potential at x due to a point charge placed at position is the
same as the potential at due to a point charge at x.
3/7/03

606

c 2003

Peter J. Olver

Unfortunately, most Greens functions, with a few exceptions, cannot be written down
in closed form. However, their fundamental properties can be based on the following
construction. As usual, the general solution to an inhomogeneous linear equation is a sum
u(x, y) = u? (x, y) + z(x, y)

(14.58)

of a particular solution u? and the general solution z to the associated homogeneous equation, namely
z = 0.
Thus, z(x, y) is an arbitrary harmonic function. We shall assume that the particular
solution u? (x, y) is due to the effect of the unit impulse, irrespective of any imposed
boundary conditions. Once we have determined u? , we shall use the freedom inherent
in the harmonic constituent z(x, y) to ensure that the sum (14.58) satisfies the required
boundary conditions.
In order to find a particular solution u? , we may appeal to physical intuition. First,
since the delta function is concentrated at the point , the solution u? must solve the
homogeneous Laplace equation u? = 0 except at the point x = , where we expect
it to have some sort of discontinuity. Second, since the Poisson equation is modeling
a homogeneous, uniform medium (membrane, plate, etc.), in the absence of boundary
conditions, the effect of a unit impulse should only depend upon on the distance away
from the source of the impulse. Therefore, we expect that the desired particular solution
u? = u? (r) will depend only on the radial variable
p
r = k x k = (x )2 + (y )2 .
According to (14.34), the only radially symmetric solutions to the Laplace equation

are
u(r) = a + b log r,

(14.59)

where a and b are constants. The constant term a is smooth and harmonic everywhere,
and so cannot contribute to a delta function singularity. Therefore, our only chance to
produce a particular solution with such a singularity at the point is if we take a multiple
of the logarithmic potential:
u? = b log r.
By construction, this function solves the Laplace equation for r 6= 0, i.e., for x 6= , and
has a singularity at r = 0. But we need to see whether, for some choice of b, it satisfies
the Poisson equation
(b log r) = b log r = (x )
(14.60)
for some choice of the constant b? There are two possible approaches to resolving this
problem, corresponding to the two interpretations of the delta function. One way would
be to approximate the delta function on the right hand side of (14.60) by a limit of highly
concentrated unit sources, e.g., the Gaussian distributions gn (x, y) as given in (14.50). We
then solve the Poisson equation un = gn , and prove that, in the limit, lim un (x, y) =
b log r for a suitable b. The details are worked out in Exercise .
3/7/03

607

c 2003

Peter J. Olver

Alternatively, we interpret both sides of (14.60) as defining linear functionals on the


space of smooth scalar functions f (x, y) by taking the L2 inner product
h b log r ; f i = h ; f i = f (, ),
where we use the defining property (14.51) of the delta function to evaluate the right hand
side. As for the left hand side, since
log r = 0

for all

r > 0,

we only need integrate


h log r ; f i =

ZZ

( log r)f (x, y) dx dy = f (, )


D

ZZ

log r dx dy,
D

over a small disk D = 0 r < = k x k < centered at the singularity x = .


Applying the divergence form (A.57) of Greens Theorem to evaluate the latter integral,
we find
ZZ
ZZ
log r dx dy
log r dx dy =
D
D
I
I
Z
log r
1
=
d = 2 ,
ds =
ds =
n
C

C r
for all > 0. Substituting this result back into (14.60), we find
h log r ; f i = 2 f (, ),

and hence

log r = 2 .

(14.61)

Therefore, the value b = 1/(2 ) leads to our desired formula (14.60), and proves that
the logarithmic potential function

1
1
1
log r =
log k x k =
log (x )2 + (y )2
(14.62)
2
2
4
is a particular solution to the Poisson equation (14.53) with a unit impulse force.
The logarithmic potential function (14.62) represents the gravitational force field in
empty space due to a unit point mass at position , or, equivalently, the electrostatic
potential due to a point charge. It should be emphasized that this is in a two-dimensional
universe; the three-dimensional versions in our physical universe is proportional to 1/r
even when restricted to a two-dimensional plane. See Section 17.1 for further details.
The gravitational or electrostatic potential due to a mass, e.g., a plate, in the shape
of a domain R 2 is given by superimposing delta function sources at each point, whose
strength is the density of the material. The result is the potential
ZZ

1
u(x, y) =
(, ) log (x )2 + (y )2 d d,
(14.63)
4

u? (x, y) =

in which (, ) is the density of the body at position (, ). For example, the gravitational
force due to a disk of radius 1, so D = { x2 + y 2 1 }, is
ZZ

1
u(x, y) =
(, ) log (x )2 + (y )2 d d,
4
D
3/7/03

608

c 2003

Peter J. Olver

which evaluates to
Returning to our boundary value problem, the general solution to (14.53) is, therefore,
u(x, y) =

1
log k x k + z(x, y),
2

(14.64)

where z(x, y) is an arbitrary harmonic function. To construct the Greens function for a
given domain R 2 with prescribed homogeneous boundary conditions on , we need
to choose the harmonic function z(x, y) so that u(x, y), as given in (14.64), satisfies the
boundary conditions. Let us state this result in the case of Dirichlet boundary conditions.
Theorem 14.9. The Greens function for the Dirichlet boundary value problem for
the Poisson equation u = f on , and u = 0 on has the form
G(x, y; , ) =
where
z(x, y) =

1
log (x )2 + (y )2 + z(x, y)
4

1
log (x )2 + (y )2 ,
4

(14.65)

(x, y) ,

is the harmonic function that has the same boundary values as the logarithmic potential
function.
The Method of Images
The preceding analysis uncovers the basic form of the Greens function, but we are still
left with the determination of the harmonic component required to match the logarithmic
potential boundary values. There are three principal techniques used to determine explicit
formulas. The first is an adaptation of the method of separation of variables, and leads to
infinite series expressions, similar to those of the fundamental solution for the heat equation
derived in Chapter 13. We will not dwell on this technique here, although a couple of the
exercises ask the reader to fill in the details. The second method is called the method
of images and will be described in this section. The most powerful method is based on
the theory of conformal mappings, and will be presented in Section 15.3 in the subsequent
chapter. While the first two methods only apply to a fairly limited class of domains,
they do adapt straightforwardly to higher dimensional problems, as well as certain other
types of elliptic partial differential equations, whereas the method of conformal mapping
is, unfortunately, only applicable to two-dimensional problems involving the Laplace and
Poisson equations.
We already know that the singular part of the Greens function for the two-dimensional
Poisson equation is provided by a logarithmic potential. The problem, then, is to construct
the harmonic part, called z(x, y) in (14.65), so that the sum has the correct homogeneous
boundary values, or, equivalently, that z(x, y) has the same boundary values as the logarithmic potential.
In certain cases, z(x, y) can be thought of as the potential induced by one or more
electric charges (or, equivalently, gravitational point masses) that are located outside the
domain , arranged in such a manner that their electrostatic potential coincides with the
3/7/03

609

c 2003

Peter J. Olver

Figure 14.7.

Method of Images for the Unit Disk.

logarithmic potential on the boundary of the domain. The goal, then, is to place the image
charges in the proper positions.
We only consider the simplest case of a single image, located at a position 6 .
We slightly generalize the logarithmic potential (14.62) by allowing an arbitrary scalar
mutliple and also an extra constant:
z(x, y) = a, log k x k + b.
This function is harmonic inside since the logarithmic potential is harmonic everywhere
except at the singularity , which is assumed to lies outside the domain. For the Dirichlet
boundary value problem, then, for each point we require an image point 6 and
constants a, b R, such that
log k x k = a log k x k + b

for all

x .

(14.66)

To simplify the formulas, we have omitted the 1/(2 ) factor, which can easily be reinstated
at the end of the analysis.
In order to make further progress, we make some simplifying assumptions. First, we
assume that a = 1, and so (14.66) can be rewritten as
k x k = k x k,
where = log b. We now use a geometrical construction based upon similar triangles. We
choose = a to be a point lying on the ray through , chosen so that the triangle with
vertices 0, x, is similar to the triangle with vertices 0, , x, noting that they have the
same angle at the common vertex 0, as illustrated in Figure 14.7. Similarity requires that
the triangles sides be in a common ratio, and so
kxk
kx k
kk
=
=
= .
kxk
kk
kx k

(14.67)

Thus, if we choose
kk =
3/7/03

1
,
kk

then
610

k x k2 = k k k k = 1,
c 2003

Peter J. Olver

Figure 14.8.

Greens Function for the Unit Disk.

and hence x lies on the boundary of the unit disk. Given inside the disk, its image point
will be at the reciprocal radius, with
=

.
k k2

(14.68)

The map taking the point to the point defined by (14.68) is known as inversion with
respect to the unit circle. The final equation in (14.67) implies that
kx k = kk kx k =

k k k2 x k
.
kk

Consequently, the functions


k k k2 x k
1
1
log
=
log k x k,
2
kk
2

(14.69)

has the same boundary values on the unit circle k x k = 1. Consequently, their difference
G(x; ) =

1
k k k2 x k
1
k k k2 x k
1
log k x k +
log
=
log
2
2
kk
2
kk k xk

has the required properties of the Greens function for the unit disk. In terms of polar
coordinates
x = (r cos , r sin ),
= ( cos , sin ),
the Law of Cosines leads to the explicit formula

1
1 + r 2 2 2 r cos( )
G(r, ; , ) =
log
.
4
r2 + 2 2 r cos( )

(14.70)

In Figure 14.8 we sketch the Greens function corresponding to a unit impulse being applied
at a point half way between the center and the edge of the disk.
Applying the general superposition formula (14.56), we arrive at a general formula for
the solution to the Dirichlet boundary value problem for the Poisson equation in the unit
disk.
3/7/03

611

c 2003

Peter J. Olver

Theorem 14.10. The solution u(r, ) to the homogeneous Dirichlet boundary value
problem
u = f,
r = k x k < 1,
u = 0,
r=1
is, in polar coordinates,
1
u(r, ) =
4

2
0

f (, ) log
0

1 + r 2 2 2 r cos( )
r2 + 2 2 r cos( )

d d.

The Greens function can also be used to solve the inhomogeneous boundary value
problem.
Theorem 14.11. Let G(x; ) denote the Greens function for the homogeneous
dirichlate boundary value problem for the Poisson equation on a domain R 2 . Then
the solution to the inhomogeneous Dirichlet problem
u = f,

x ,

u = h,

x ,

(14.71)

G(x; )
h() ds.
n

(14.72)

is given by
u(x) = v(x) + (x) =

ZZ

G(x; ) f () d d

Proof : Let (x) be any function such that


= h,

for

x .

Set v = u , so that v satisfies the homogeneous boundary value problem


v = f + ,

x ,

v = 0,

x .

We can therefore express


v(x) =

ZZ

G(x; ) f () + () d d.

The second integral can be simplified using the integration by parts formula (Greenudeltauv ):
ZZ
ZZ
G(x; ) () d d =
G(x; ) () d d +

I
() G(x; )

() ds.
+
G(x; )
n
n

Since the Greens function solves G = , the first term reproduces (x). Moreover,
G = 0 and = h on , and so this reduces to (14.72).
Q.E.D.
For example, applying (14.72) to the Greens function (14.70) for the unit disk recovers
the Poisson integral formula (14.44).
3/7/03

612

c 2003

Peter J. Olver

14.4. Adjoints and Minimum Principles.


We shall now explain how the Laplace and Poisson equations fit into our universal
self-adjoint equilibrium framework. The one-dimensional version of the Poisson equation,

d2 u
= f,
dx2

is the equilibrium equation for a uniform elastic bar. In Section 10.3, we wrote the underlying boundary value problems in self-adjoint form D D[ u ] = f based on the derivative
operator Du = u0 and its adjoint D = D with respect to the standard L2 inner product.
For the two-dimensional Poisson equation
[ u ] =

2u 2u
2 = f (x, y)
x2
y

the role of the one-dimensional derivative D will be played by the gradient operator

ux
u = grad u =
.
uy
The gradient maps a scalar-valued function u(x, y) to the vector-valued function consisting of its two first order partial derivatives. Thus, its domain is the vector space
U = C1 (, R) consisting of all continuously differentiable functions u(x, y) defined for
(x, y) . The target space V = C0 (, R 2 ) consists of all continuous vector-valued
T
functions v(x, y) = ( v1 (x, y), v2 (x, y) ) , known as vector fields. (By way of analogy,
scalar-valued functions are sometimes referred to as scalar fields.) The gradient defines a
linear map
: U V
from scalar fields to vector fields. Indeed, if u1 , u2 U are any two scalar functions and
c1 , c2 R any constants, then
(c1 u1 + c2 u2 ) = c1 u1 + c2 u2 ,
which is the requirement for linearity of Definition 7.1.
In accordance with the general Definition 7.43, the adjoint of the gradient must go in
the reverse direction,
: V U,
mapping vector fields v(x, y) to scalar functions z(x, y) = v. The defining equation for
the adjoint
hh u ; v ii = h u ; v i
(14.73)
requires inner products on the two vector spaces. The simplest inner product between
real-valued scalar functions u(x, y), v(x, y) defined on a domain R 2 is given by the
double integral
ZZ
hu;vi =

u(x, y) v(x, y) dx dy.

(14.74)

3/7/03

613

c 2003

Peter J. Olver

As in the one-dimensional case (3.8), this is often referred to as the L 2 inner product
between scalar fields, with associated norm
ZZ
2
kuk = hu;ui =
u(x, y)2 dx dy.

More generally, the L2 inner product between vector-valued functions (vector fields) defined
on is obtained by integrating their usual dot product:
ZZ
ZZ

v1 (x, y) w1 (x, y) + v2 (x, y) w2 (x, y) dx dy.


hh v ; w ii =
v(x, y) w(x, y) dx dy =

(14.75)
These form the two most basic inner products on the spaces of scalar and vector fields,
and are the ones required to place the Laplace and Poisson equations in self-adjoint form.
The adjoint identity (14.73) is supposed to hold for all appropriate scalar fields u and
vector fields v. For the L2 inner products (14.74), (14.75), the two sides of the identity
read
ZZ
ZZ
u
u
hh u ; v ii =
+ v2
dx dy,
u v dx dy =
v1
x
y

Z Z
h u ; v i =
u v dx dy.

Thus, to equate these two double integrals, we need to remove the derivatives from the
scalar field u. As in the one-dimensional computation (10.68), the secret is integration by
parts.
As the student may recall, integration by parts is an immediate consequence of the
Fundamental Theorem of Calculus when applied to Leibnizs rule for the derivative of the
product of two functions. Now, according to Chapter A, Greens Theorem A.25 plays the
role of the fundamental theorem in two-dimensional calculus. We will find the divergence
form
ZZ
I
v dx dy =
v n ds.
(14.76)

the more convenient for the present purposes. In analogy with the one-dimensional argument, we now replace the vector field v by the product u v of a scalar field u and a vector
field v. An elementary computation proves that
(u v) = u v + u v.

(14.77)

As a result, we deduce what is usually known as Greens formula


ZZ
I

u v + u v dx dy =
u (v n) ds,

(14.78)

which is valid for arbitrary bounded domains , and arbitrary scalar and vector fields
defined thereon. Rearranging the terms in this integral identity produces the required
integration by parts formula for double integrals:
ZZ
I
ZZ
u v dx dy =
u (v n) ds
u v dx dy.
(14.79)

3/7/03

614

c 2003

Peter J. Olver

The first term on the right hand side of this identity is a boundary term, just like the first
terms on the right hand side of the one-dimensional integration by parts formula (10.71).
Moreover, the derivative operation has moved from a gradient on the scalar field to a
divergence on the vector field in the double integral on the right even the minus sign is
there!
Now, The left hand side in the integration by parts formula (14.79) is the same as the
left hand side of (14.73). If the boundary integral vanishes,
I
u v n ds = 0,
(14.80)

then the right hand side of formula (14.79) also reduces to an L2 inner product
ZZ
ZZ

u v dx dy =
u ( v) dx dy = h u ; v i

between the scalar field u and minus the divergence of the vector field v. Therefore, subject
to the boundary constraint (14.80), the integration by parts formula reduces to the inner
product identity
hh u ; v ii = h u ; v i.
(14.81)
Comparing (14.73), (14.81), we conclude that v = v, and hence the adjoint of
the gradient operator is minus the divergence, = . In this manner, we are able to
write the two-dimensional Poisson equation in the standard self-adjoint form
u = u = (u) = f

(14.82)

subject to an appropriate system of boundary conditions that justify (14.81).


The vanishing of the boundary integral (14.80) will be ensured by the imposition of
suitable homogeneous boundary conditions on the scalar field u and/or the vector field
v. Clearly the line integral will vanish if either u = 0 or v n = 0 at each point on the
boundary. These lead immediately to the three principle types of boundary conditions.
The first are the fixed or Dirichlet boundary conditions, which require
u=0

on

(14.83)

Alternatively, we can require


vn=0

on

(14.84)

which requires that v be tangent to at each point, and so there is no net flux across
the (solid) boundary. If we identify v = u, then the no flux boundary condition (14.84)
translates into the Neumann boundary conditions
u
= u n = 0
on
.
(14.85)
n
One can evidently also mix the boundary conditions, imposing Dirichlet conditions on part
of the boundary, and Neumann on the complementary part:
u=0
3/7/03

on D,

u
= 0 on
n

N,
615

where

= D N
c 2003

(14.86)

Peter J. Olver

is the disjoint union of the Dirichlet and Neumann parts.


To model inhomogeneous membranes, heat flow through inhomogeneous media, and
similar physical equilibria, we replace the L2 inner product between vector fields by the
weighted version
ZZ

hh v ; w ii =
p(x, y) v1 (x, y) w1 (x, y) + q(x, y) v2 (x, y) w2 (x, y) dx dy,
(14.87)

in which p(x, y), q(x, y) > 0 are strictly positive functions on the domain (x, y) .
Retaining the usual L2 inner product (14.74) between scalar fields, let us compute the
weighted adjoint of the gradient operator. Using the same basic defining formula (14.73),
we compute
ZZ
u
u
+ q v2
dx dy.
hh u ; v ii =
p v1
x
y

We then apply the same integration by parts formula (14.79) to remove the derivatives
from the scalar field u, leading to
I
ZZ

u
u
u q v2 dx + u p v1 dy
+ q v2
dx dy =
p v1
x
y

ZZ
(14.88)
(p v1 ) (q v2 )

u
dx dy.
+
x
y

Equating this to the right hand side h u ; v i, we deduce that, provided the boundary
integral vanishes, the weighted adjoint of the gradient operator with respect to (14.87) is
given by
v =

p
q
(p v1 ) (q v2 )
v
v

= p 1 q 2 v1
v2
.
x
y
x
y
x
y

(14.89)

The boundary integral in (14.88) vanishes provided either u = 0 or v = 0 on . Therefore,


the same homogeneous boundary conditions Dirichlet, Neumann or mixed are still
applicable.
The corresponding self-adjoint boundary value problem takes the form

u =
p(x, y)

q(x, y)
= f (x, y),
(x, y) , (14.90)
x
x
x
x
along with either homogeneous or inhomogeneous boundary conditions of either Dirichlet, Neumann or mixed type. The weight functions p, q are prescribed by the physical
inhomogeneities in the body.
Positive Definiteness and the Dirichlet Principle
In conclusion, as a result of the integration by parts calculation, we have successfully
formulated the Poisson and Laplace equations (as well as their weighted couterparts) in
self-adjoint form
u = u = f,
3/7/03

616

c 2003

Peter J. Olver

including either Dirichlet, Neumann, or mixed boundary conditions. A key benefit of the
formulation of a system in self-adjoint form is, in the positive definite cases, the characterization of the solutions by a minimization principle.
According to Theorem 7.51, the self-adjoint operator is positive definite if and
only if the kernel of the underlying gradient operator restricted to the appropriate space
of scalar fields is trivial: ker = {0}. The determination of the kernel of the gradient
operator relies on the following elementary fact, which is the multi-variable version of the
result that the only function with zero derivative is a constant.
Lemma 14.12. If u(x, y) is a C1 function defined on a connected domain, then
u 0 if and only if u c is a constant.
This result is a simple consequence of Theorem A.20; see Exercise . Therefore, the
only functions which could show up in ker , and thus prevent positive definiteness, are
the constants. The boundary conditions will tell us whether or not this occurs. The
only constant function that satisfies either homogeneous Dirichlet or homogeneous mixed
boundary conditions is the zero function, and thus, just as in the one-dimensional case,
the boundary value problem for the Poisson equation with Dirchlet or mixed boundary
conditions is positive definite. On the other hand, any constant function satisfies the
homogeneous Neumann boundary conditions, and hence such boundary value problems
are only positive semi-definite.
In the positive definite definite cases, when ker = {0} as dictated by the boundary
conditions the equilibrium solution can be characterized by our basic minimization
principle based on the general formula (7.57). For the Poisson equation, the resulting
quadratic functional is the justly famous Dirichlet principle.
Theorem 14.13. The solution u(x, y) to the Poisson equation (14.3) subject to
either homogeneous Dirichlet or mixed boundary conditions is characterized as the unique
function that minimizes the Dirichlet integral
ZZ

1 2 1 2
2
1
k
u
k

h
u
;
f
i
=
u
+
u

f
u
dx dy
(14.91)
2
2 x
2 y

among all C functions that satisfy the prescribed boundary conditions.

In physical applications the Dirichlet integral (14.91) represents the energy in the
system. Hence, just as in discrete and one-dimensional mechanics, Nature chooses the
equilibrium configuration so as to minimize the energy. The application of this minimum
principle for numerical approximation to the solutions based on the finite element approach
will form the subject of Section 14.5.
Remark : Theorem 14.13 only says that if a minimum is achieved, then it must satisfy
the boundary value problem. It does not actually guarantee the existence of a minimizer,
and hence a solution to the boundary value problem. Dirichlet originally thought this to
be self-evident, but it was later realized that the proof of existence is a rather difficult
analytical theorem. It took about 50 years from Dirichlets statement of his principle until
Hilbert supplied the first rigorous existence proof. In applications, it is certainly comforting
3/7/03

617

c 2003

Peter J. Olver

to know that there is a solution to the boundary value problem. In this introductory
treatment, we adopt a more pragmatic approach, concentrating on the computation of the
solution reassured, if necessary, by the theoreticians efforts in establishing the existence
of the solution.
The Dirichlet minimization principle (14.91) was derived under the assumption that
the boundary conditions are homogeneous either pure Dirichlet or mixed. As it turns
out, the principle also applies to inhomogeneous Dirichlet boundary conditions as stated.
However, if we have inhomogeneous Neumann conditions on part of the boundary, then
we must include an additional boundary term in the minimizing functional. The general
result can be stated as follows:
Theorem 14.14. The solution u(x, y) to the boundary value problem
u = f

in ,

u=h

on

D,

u
=k
n

on N,

with = D N , and D 6= , is characterized as the unique function that minimizes the


modified Dirichlet integral
Z
ZZ

1
2
+
u k ds dx dy
(14.92)
2 k u k f u
N

among all C1 functions that satisfy the prescribed boundary conditions.

The inhomogeneous Dirichlet problem has N = and D = , in which case the


boundary integral does not appear. An outline of the proof of this result appears in the
exercises.
While the Dirichlet and mixed boundary value problems are positive definite, any
constant function satisfies the homogeneous Neumann boundary conditions, and so in this
case ker consists of all constant functions. Therefore, just as in the one-dimensional bar,
the Neumann boundary value problem is only positive semi-definite, and so we cannot
construct a minimization principle. Indeed, when the system is only positive semi-definite,
the solution is not unique: if u(x, y) is a solution, so is u(x, y) + c for any constant c.
As we know, positive definiteness is directly related to the stability of the physical
system. The Dirichlet and mixed boundary value problems are stable, and can support
any imposed force. On the other hand, the pure Neumann boundary value problem is
unstable, owing to the existence of a nontrivial kernel the constant functions. Physically,
the unstable mode represents a rigid translation of the entire membrane in the vertical
direction. Indeed, the Neumann problem leaves the entire boundary of the membrane
unattached to any support, and so the unforced membrane is free to move up or down
without affecting its equilibrium status.
Furthermore, non-uniqueness and non-existence of solutions go hand in hand. As we
learned in Section 10.3, the existence of a solution to a Neumann boundary value problem
relies on the Fredholm alternative, suitably adapted to this multi-dimensional situation. A
necessary condition for the existence of a solution is that the forcing function be orthogonal
to the elements of the kernel of the underlying self-adjoint linear operator, which, in the
3/7/03

618

c 2003

Peter J. Olver

present situation requires that f be orthogonal to the subspace consisting of all constant
functions. In practical terms, we only need to check orthogonality with respect to a basis
for the subspace, which in this situation consists of the constant function 1. The fact that,
under such conditions, a solution actually exists is harder, and we refer to [31] for details
of the existence part of the following result.
Theorem 14.15. The Neumann boundary value problem
u = f,

in

u
= 0,
n

on

(14.93)

has a solution u(x, y) if and only if


h1;f i =

ZZ

f (x, y) dx dy = 0.

(14.94)

Moreover, the solution is not unique since any function of the form u(x, y) + c, where c R
is an arbitrary constant, is also a solution.
Forcing functions f (x, y) which do not satisfy the orthogonality constraint (14.94) will
excite the translational instability, and no equilibrium configuration is possible. For example, if we force a free membrane, (14.94) requires that the net force in the vertical direction
be zero; otherwise, the membrane will start moving and cannot be in an equilibrium.

14.5. Finite Elements.


As the reader has no doubt already guessed, explicit solutions to boundary value
problems for the Laplace and Poisson equations are few and far between. In most cases,
exact solution formulae are not available, or are so complicated as to be of scant utility.
To proceed further, one is forced to design suitable numerical approximation schemes that
can accurately evaluate the desired solution and thereby aid in the study of its behavior.
The most powerful class of numerical algorithms for solving general elliptic boundary
value problems are the finite element methods. We have already learned, in Section 10.6,
the key underlying idea. One begins with a minimization principle, prescribed by a quadratic functional defined on a suitable vector space of functions U that serves to incorporate
the (homogeneous) boundary conditions. The desired solution is characterized as the
unique minimizer u? U . One then restricts the functional to a suitably chosen finitedimensional subspace W U , and seeks a minimizer w? W . Finite-dimensionality of
W has the effect of reducing the infinite-dimensional minimization problem to a finitedimensional problem, which can then be solved by numerical linear algebra. The resulting
minimizer w? will provided the subspace W has been cleverly chosen provide a good
approximation to the true minimizer u? on the entire domain. Here we concentrate on the
practical design of the finite element procedure, and refer the reader to a more advanced
text, e.g., [FE], for the analytical details and proofs of convergence. Most of the multidimensional complications are not in the underlying theory, but rather in the realms of
data management and organizational details.
3/7/03

619

c 2003

Peter J. Olver

Figure 14.9.

Triangulation of a Planar Domain and Piecewise Affine Function.

In this section, we first concentrate on applying these ideas to the two-dimensional


Poisson equation. For specificity, we first treat the homogeneous Dirichlet boundary value
problem
u = f in
u = 0 on .
(14.95)
According to Theorem 14.13, the solution u = u? is characterized as the unique minimizing function for the Dirichlet functional (14.91) among all smooth functions u(x, y)
that satisfy the prescribed boundary conditions. In the finite element approximation, we
restrict the Dirichlet functional to a suitably chosen finite-dimensional subspace. As in
the one-dimensional situation, the most convenient finite-dimensional subspaces consist of
functions that may lack the requisite degree of smoothness that qualifies them as possible
solutions to the partial differential equation. Nevertheless, they do provide good approximations to the actual solution. An important practical consideration is to use functions
with small support, cf. Definition 12.5. The resulting finite element matrix will then be
sparse and the solution to the linear system can be relatively rapidly calculate, usually by
application of an iterative numerical scheme such as the GaussSeidel or SOR methods
discussed in Chapter 9.
Finite Elements and Triangulation
For one-dimensional boundary value problems, the finite element construction rests on
the introduction of a mesh a = x0 < x1 < < xn = b on the interval of definition. The
mesh nodes xk break the interval into a collection of small subintervals. In two-dimensional
problems, a mesh consists of a finite number of points xk = (xk , yk ), k = 1, . . . , m, known
as nodes, usually lying inside the domain R 2 . As such, there is considerable freedom
in the choice of mesh nodes, and completely uniform spacing is often not possible. We
regard the nodes as forming the vertices of a triangulation of the domain , consisting of
a finite number of small triangles, which we denote by T1 , . . . , TN . The nodes are split
into two categories interior nodes and boundary nodes, the latter lying on or close to
the boundary of the domain. A curved boundary is approximated by the polygon through
3/7/03

620

c 2003

Peter J. Olver

Figure 14.10.

Finite Element Pyramid Function.

the boundary nodes formed by the sides of the triangles lying on the edge of the domain;
see Figure 14.9 for a typical example. Thus, in computer implementations of the finite
element method, the first ingredient is a routine that will automatically triangulate a
specified domain in some reasonable manner; see below for details on what reasonable
entails.
As in our one-dimensional finite element construction, the functions w(x, y) in the
finite-dimensional subspace W will be continuous and piecewise affine. Piecewise affine
means that, on each triangle, the graph of w is flat, and so has the formula
w(x, y) = + x + y,

for

(x, y) T .

(14.96)

Continuity of w requires that its values on a common edge between two triangles must
agree, and this will impose certain compatibility conditions on the coefficients , ,
and , , associated with adjacent pairs of triangles T , T . The graph of z = w(x, y)
forms a connected polyhedral surface whose triangular faces lie above the triangles in the
domain; see Figure 14.9 for an illustration.
The next step is to choose a basis of the subspace of piecewise affine functions for the
given triangulation. As in the one-dimensional version, the most convenient basis consists
of pyramid functions k (x, y) which have the value 1 at a single node xk , and zero at all
the other nodes; thus

1,
i = k,
k (xi , yi ) =
(14.97)
0,
i 6= k.
Note that k will be nonzero only on those triangles which have the node xk as one of
their vertices, and hence the graph of k looks like a pyramid of unit height sitting on a
flat plane, as illustrated in Figure 14.10.
The pyramid functions k (x, y) corresponding to the interior nodes xk satisfy the
homogeneous Dirichlet boundary conditions on the boundary of the domain or, more
correctly, on the polygonal boundary of the triangulated domain, which is supposed to
be a good approximation to the curved boundary of the original domain . Thus, the
finite-dimensional finite element subspace W will be spanned by the interior node pyramid

Here and subsequently, the index is a superscript, not a power!

3/7/03

621

c 2003

Peter J. Olver

functions. A general element w W is a linear combination thereof, so


w(x, y) =

n
X

ck k (x, y),

(14.98)

k=1

where the sum ranges over the n interior nodes of the triangulation. Owing to the original
specification (14.97) of the pyramid functions, the coefficients
ck = w(xk , yk ) u(xk , yk ),

k = 1, . . . , n,

(14.99)

are the same as the values of the finite element approximation w(x, y) at the interior
nodes. This immediately implies linear independence of the pyramid functions, since the
only linear combination that vanishes at all nodes is the trivial one c 1 = = cn = 0. Thus,
the interior node pyramid functions k 1, . . . n form a basis for finite element subspace W ,
which therefore has dimension equal to n, the number of interior nodes.
The explicit formulae for the finite element basis functions are not difficult to determine. On one of the triangles T that has xk as a vertex, k (x, y) will be the unique affine
function (14.96) that takes the value 1 at the vertex xk and 0 at the other two vertices
xl , xm . Thus, we need a formula for an affine function or element
k (x, y) = k + k x + k y,

(x, y) T ,

(14.100)

that takes the prescribed values


k (xk , yk ) = 1,

k (xi , yi ) = k (xj , yj ) = 0,

at three specified points. These three conditions lead to the linear system
k (xi , yi ) = k + k xi + k yi = 0,
k (xj , yj ) = k + k xj + k yj = 0,
k (xk , yk )

k xk

k yk

(14.101)

= 1.

The solution produces the explicit formulae


k =

x i yj x j yi
,

k =

for the coefficients. The denominator

= det 1
1

yi y j
,

xi
xj
xk

k =

xj x i
,

yi
yj = 2 area T
yk

(14.102)

(14.103)

is, up to sign, twice the area of the triangle T ; see Exercise .

Cramers Rule (Cramer3 ) comes in handy here

3/7/03

622

c 2003

Peter J. Olver

Figure 14.11.

Square Mesh Triangulations.

Example 14.16. Consider an isoceles right triangle T with vertices


x1 = (0, 0),

x2 = (1, 0),

x3 = (0, 1).

Using equations (14.102), (14.103) (or solving the linear systems (14.101) directly), we
immediately produce the three affine elements
1 (x, y) = 1 x y,

2 (x, y) = x,

3 (x, y) = y.

(14.104)

They are defined so that k equals 1 at the vertex xk and is zero at the other two vertices.
The finite element pyramid function is then obtained by piecing together the individual
affine elements:

    φ_k(x, y) =  { ω_k(x, y),   if (x, y) ∈ T which has x_k as a vertex,
                 { 0,           otherwise.                                (14.105)

Continuity of φ_k(x, y) is ensured by the fact that the constituent affine elements have the
same values at common vertices. The support of the finite element basis function (14.105)
is the polygon

    supp φ_k = P_k = ∪ T,                                                 (14.106)

consisting of all the triangles T that have the node x_k as a vertex. Thus, φ_k(x, y) = 0
whenever (x, y) ∉ P_k. We will call P_k the k-th vertex polygon. The node x_k lies in the
interior of its vertex polygon P_k, while the vertices of P_k are all the adjacent vertices that
are connected to x_k by an edge of the triangulation. In Figure vpol the shaded regions
are two of the vertex polygons for the triangulation in Figure 14.9.
Example 14.17. The simplest, and most common, triangulations are based on
regular meshes. Suppose that the nodes lie on a square grid, and so are of the form
x_{i,j} = (i h + a, j h + b), where h > 0 is the inter-node spacing, and (a, b) is an overall
offset. If we choose the triangles to all have the same orientation, as in the first picture in
Figure 14.11, then the vertex polygons all have the same shape, consisting of 6 triangles
of total area 3 h², the shaded region. On the other hand, if we choose an alternating,
perhaps more aesthetically pleasing, triangulation, as in the second picture, then there are
two types of vertex polygons. The first, consisting of four triangles, has area 2 h², while
the second, containing 8 triangles, has twice the area, 4 h². In practice, there are good
reasons to prefer the former triangulation; see below.
In general, in order to ensure convergence of the finite element solution to the true
minimizer, one should choose a triangulation with the following properties:
(a) The triangles are not too long and skinny. In other words, the sides should have
comparable lengths. In particular, obtuse triangles should be avoided.
(b) The areas of nearby triangles T should not vary too much.
(c) The areas of nearby vertex polygons Pk should also not vary too much.
For adaptive or variable meshes, one might very well have wide variations in area over the
entire grid, with small triangles in regions of rapid change in the solution, and large ones in
less interesting regions. But, overall, the sizes of the triangles and vertex polygons should
not dramatically vary as one moves across the domain.
The Finite Element Equations
We now seek to approximate the solution to the homogeneous Dirichlet boundary
value problem by restricting the Dirichlet functional to the finite element subspace W .
Substituting the formula (14.98) for a general element of W into the quadratic Dirichlet
functional (14.91) and expanding, we find
    P[ w ] = P[ Σ_{i=1}^n c_i φ_i ]
           = ∫∫_Ω [ (1/2) ‖ Σ_{i=1}^n c_i ∇φ_i ‖² − f ( Σ_{i=1}^n c_i φ_i ) ] dx dy
           = (1/2) Σ_{i,j=1}^n k_ij c_i c_j − Σ_{i=1}^n b_i c_i  =  (1/2) cᵀ K c − bᵀ c.

Here K = (k_ij) is a symmetric n × n matrix, while b = ( b_1, b_2, . . . , b_n )ᵀ is a vector with
respective entries

    k_ij = ⟨ ∇φ_i ; ∇φ_j ⟩ = ∫∫_Ω ∇φ_i · ∇φ_j dx dy,        b_i = ⟨ f ; φ_i ⟩ = ∫∫_Ω f φ_i dx dy.        (14.107)

Thus, to determine the finite element approximation, we need to minimize the quadratic
function

    P(c) = (1/2) cᵀ K c − bᵀ c                                             (14.108)

over all possible choices of coefficients c = ( c_1, c_2, . . . , c_n )ᵀ ∈ ℝⁿ, i.e., over all possible
function values at the interior nodes.
Restricting to the finite element subspace has reduced us to a standard finite-dimensional
quadratic minimization problem. First, the coefficient matrix K > 0 is positive definite
due to the positive definiteness of the original functional; the proof in Section 10.6 is easily

adapted to the present situation. Theorem 4.2 tells us that the minimizer is obtained by
solving the associated linear system
K c = b.
(14.109)
The solution to (14.109) is effected by either Gaussian elimination or an iterative technique.
To find explicit formulae for the matrix coefficients k_ij in (14.107), we begin by noting
that the gradient of the affine element (14.100) is equal to

    ∇ω_k(x, y) = a_k = ( β_k , γ_k )ᵀ = (1/Δ) ( y_i − y_j , x_j − x_i )ᵀ,        (x, y) ∈ T,        (14.110)

which is a constant vector within the triangle T; outside it, ∇ω_k = 0 is zero. Therefore,

    ∇φ_k(x, y) = { a_k = ∇ω_k ,   if (x, y) ∈ T which has x_k as a vertex,
                 { 0,             otherwise.                               (14.111)

Actually, (14.111) is not quite right, since if (x, y) lies on the boundary of a triangle T,
then the gradient does not exist. However, this technicality will not cause any difficulty
in evaluating the ensuing integral. Thus, ∇φ_k reduces to a piecewise constant function on
the triangulation.
We will approximate integrals over the domain Ω by integrals over the triangles,
which assumes that the polygonal boundary of the triangulation is a reasonably close
approximation to the true boundary ∂Ω. In particular,

    k_ij ≈ Σ_ν ∫∫_{T_ν} ∇φ_i · ∇φ_j dx dy ≈ Σ_ν k^ν_ij .                  (14.112)

Now, according to (14.111), one or the other gradient in the integrand will vanish on the
entire triangle T_ν unless both x_i and x_j are vertices. Therefore, the only terms contributing
to the sum are those triangles T_ν that have both x_i and x_j as vertices. If i ≠ j there are only
two such triangles, while if i = j every triangle in the i-th vertex polygon P_i contributes.
The individual summands are easily evaluated, since the gradients are constant on the
triangles, and so

    k^ν_ij = ∫∫_{T_ν} a^ν_i · a^ν_j dx dy = a^ν_i · a^ν_j  area T_ν .
Let T_ν have vertices x_i, x_j, x_k. Then, by (14.110), (14.111) and (14.103),

    k^ν_ij = [ (y_j − y_k)(y_k − y_i) + (x_k − x_j)(x_i − x_k) ] / (4 area T_ν)
           = − (x_i − x_k) · (x_j − x_k) / (4 area T_ν),        i ≠ j,
                                                                           (14.113)
    k^ν_ii = [ (y_j − y_k)² + (x_k − x_j)² ] / (4 area T_ν)
           = ‖ x_j − x_k ‖² / (4 area T_ν),        where  area T_ν = (1/2) | Δ_ν |.

In this manner, each triangle T_ν is associated with a collection of 6 different coefficients,
k^ν_ij = k^ν_ji, known as the elemental stiffnesses of T_ν. The indices i, j range over the three
different vertices of the triangle T_ν. In practice, one assembles the elemental stiffnesses

into a symmetric 3 × 3 matrix S_ν, known as the elemental stiffness matrix of the triangle,
whose rows and columns are labeled by its vertices x_i, x_j, x_k.
Interestingly, the elemental stiffnesses depend only on the angles of the triangle and
not on its size. Thus, similar triangles have the same elemental stiffness matrix provided
their vertices are labeled in the same order. Indeed, if we denote the angle in T_ν at the
vertex x_k by θ_k, then, according to Exercise ,

    k^ν_ij = k^ν_ji = − (1/2) cot θ_k ,   i ≠ j,        while        k^ν_ii = (1/2) ( cot θ_j + cot θ_k ),        (14.114)

depend only upon the cotangents of the angles, and hence the elemental stiffness matrix
has the form

    S_ν = (1/2) [ cot θ_j + cot θ_k ,   − cot θ_k ,           − cot θ_j ;
                  − cot θ_k ,           cot θ_i + cot θ_k ,   − cot θ_i ;            (14.115)
                  − cot θ_j ,           − cot θ_i ,           cot θ_i + cot θ_j ]

One can use either formula for the elemental stiffness matrix. Equation (14.113) is more
convenient when one is given the coordinates of its vertices, while (14.115) should be used
if one knows its angles.
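For instance, a direct implementation of the coordinate formula (14.113) might look as follows. This is only a sketch (Python with NumPy; the routine name is an assumption, not from the text), with the rows and columns ordered as the vertices are listed; applied to the vertices (0, 0), (1, 0), (0, 1) it reproduces (14.116), and applied to any equilateral triangle it reproduces (14.117).

    import numpy as np

    def elemental_stiffness(verts):
        # verts: 3 x 2 array of vertex coordinates of the triangle.
        x, y = verts[:, 0], verts[:, 1]
        Delta = np.linalg.det(np.column_stack([np.ones(3), x, y]))   # = +/- 2 area T
        area = abs(Delta) / 2
        # Constant gradient a_i of the i-th affine element, cf. (14.110).
        grads = np.empty((3, 2))
        for i in range(3):
            j, k = (i + 1) % 3, (i + 2) % 3
            grads[i] = [(y[j] - y[k]) / Delta, (x[k] - x[j]) / Delta]
        return grads @ grads.T * area        # k_ij = a_i . a_j * area T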

Example 14.18. The right triangle with vertices (0, 0), (1, 0), (0, 1) has elemental
stiffness matrix

    S = [   1      −1/2     −1/2 ;
          −1/2      1/2       0  ;                                         (14.116)
          −1/2       0       1/2 ]

The same holds for any other isosceles right triangle, as long as we choose the first vertex
to be at the right angle and the other two at the 45° angles. A different ordering of the
vertices will serve to permute the rows and columns of S. Similarly, an equilateral triangle
has all 60° angles, and so its elemental stiffness matrix is

    S = [   1/√3       −1/(2√3)    −1/(2√3) ;
          −1/(2√3)      1/√3       −1/(2√3) ;
          −1/(2√3)     −1/(2√3)     1/√3    ]
      ≈ [   0.577350   −0.288675   −0.288675 ;
           −0.288675    0.577350   −0.288675 ;                             (14.117)
           −0.288675   −0.288675    0.577350 ]

See Figure ts .

Assembling the Elements


Each elemental stiffness matrix will contribute, through the summation (14.112), to
the finite element coefficient matrix K. We begin by constructing a larger matrix K ,
which we call the full finite element matrix , of size m m where m is the total number
of nodes in our triangulation, including both interior and boundary nodes. The rows and
columns of K are labeled by the nodes xi . On the other hand, the three rows and columns
of an individual elemental stiffness matrix S are labeled by the vertices of its triangle T .

We let K = (kij
) denote the corresponding m m matrix containing the 9 entries of S

Figure 14.12.  The Oval Plate.

which are placed in the rows and columns corresponding to the vertices of the triangle T_ν;
all other entries of K_ν are 0. For example, if T_3 has vertices x_2, x_3, x_6, then the entries
of its elemental stiffness matrix S_3 will appear in rows and columns with labels 2, 3, 6 of
the full matrix K_3. The resulting m × m matrices are all summed together over all the
triangles,

    K* = Σ_{ν=1}^N K_ν ,                                                   (14.118)

to produce the full finite element matrix. As in (14.112), each entry k*_ij = Σ_ν k^ν_ij of K*
will be a sum of elemental stiffnesses corresponding to all the triangles that have x_i and
x_j as vertices.
The full finite element matrix K* is too large for our linear system (14.109), since its
rows and columns include all the nodes, whereas the finite element matrix K appearing
in (14.109) only refers to the n interior nodes. The reduced n × n finite element matrix
K is simply obtained from K* by deleting all rows and columns indexed by boundary
nodes, retaining only the entries k*_ij when both x_i and x_j are interior nodes. (This may
remind the reader of our construction of the reduced incidence matrix for a structure in
Chapter 6.) For the homogeneous boundary value problem, this is all we require. However,
as we shall see, inhomogeneous boundary conditions are most easily handled by retaining
(part of) the full matrix K*.
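In pseudocode, the assembly (14.118) is a plain loop over the triangles. The sketch below (Python with NumPy; the data layout and the helper elemental_stiffness from the earlier sketch are assumptions, not part of the text) builds the full matrix K* and then extracts the reduced matrix K by keeping only the rows and columns belonging to interior nodes.

    import numpy as np

    def assemble(nodes, triangles, interior):
        # nodes: (m, 2) array of node coordinates;
        # triangles: list of index triples (i, j, k) into the node array;
        # interior: list of indices of the interior nodes.
        m = len(nodes)
        K_full = np.zeros((m, m))                       # the full matrix K*
        for tri in triangles:
            S = elemental_stiffness(nodes[list(tri)])   # 3 x 3 elemental block
            for a, i in enumerate(tri):
                for b, j in enumerate(tri):
                    K_full[i, j] += S[a, b]             # add into rows/columns of the vertices
        K = K_full[np.ix_(interior, interior)]          # reduced matrix: interior rows and columns
        return K_full, K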
The easiest way to understand the construction is through a particular example.
Example 14.19. A metal plate has the shape of an oval running track, consisting
of a rectangle, with side lengths 1 m by 2 m, and two semicircular disks glued onto the
shorter ends, as sketched in Figure 14.12. The plate is subject to a heat source while its
edges are held at a fixed temperature. The problem is to find the equilibrium temperature
distribution within the plate. Mathematically, we must solve the Poisson equation −Δu = f
with prescribed Dirichlet boundary conditions, for the equilibrium temperature u(x, y).
Let us describe how to set up the finite element approximation to such a boundary value
problem.
We begin with a very coarse triangulation of the plate, which will not give particularly
accurate results, but does serve to illustrate how to go about assembling the finite element
matrix. We divide the rectangular part of the plate into 8 right triangles, while each
semicircular end will be approximated by three equilateral triangles. The triangles are
numbered from 1 to 14 as indicated in Figure 14.13. There are 13 nodes in all, numbered

Figure 14.13.  A Coarse Triangulation of the Oval Plate (left: Triangles, numbered 1–14; right: Nodes, numbered 1–13).

as in the second figure. Only nodes 1, 2, 3 are interior, while the boundary nodes are labeled
4 through 13, going counterclockwise around the boundary starting at the top. Therefore,
the full finite element matrix K* will have size 13 × 13, its rows and columns labeled by
all the nodes. The reduced matrix K appearing in the finite element equations (14.109)
consists of the upper left 3 × 3 submatrix of K*.
Each triangle T_ν will contribute the summand K_ν to the matrix K*, modifying the
nine entries k*_ij indexed by the vertices of T_ν. The values are extracted from the nine
entries of the elemental stiffness matrix for that triangle. For example, the first triangle
T_1 is equilateral, and so has elemental stiffness matrix (14.117). Its vertices are labeled 1,
5, and 6, and therefore we place the entries of (14.117) in the rows and columns numbered
1, 5, 6 to form the summand

    K_1 = [  0.577350   0   0   0   −0.288675   −0.288675   0   ⋯ ;
             0          0   0   0    0           0           0   ⋯ ;
             0          0   0   0    0           0           0   ⋯ ;
             0          0   0   0    0           0           0   ⋯ ;
            −0.288675   0   0   0    0.577350   −0.288675    0   ⋯ ;
            −0.288675   0   0   0   −0.288675    0.577350    0   ⋯ ;
             0          0   0   0    0           0           0   ⋯ ;
             ⋮          ⋮   ⋮   ⋮    ⋮           ⋮           ⋮   ⋱ ]
where all the undisplayed entries in the full 13 × 13 matrix are 0. The next triangle T_2
has the same equilateral elemental stiffness matrix (14.117), but now its vertices are 1, 6, 7,

and so it will contribute

    K_2 = [  0.577350   0   0   0   0   −0.288675   −0.288675   0   ⋯ ;
             0          0   0   0   0    0           0           0   ⋯ ;
             0          0   0   0   0    0           0           0   ⋯ ;
             0          0   0   0   0    0           0           0   ⋯ ;
             0          0   0   0   0    0           0           0   ⋯ ;
            −0.288675   0   0   0   0    0.577350   −0.288675    0   ⋯ ;
            −0.288675   0   0   0   0   −0.288675    0.577350    0   ⋯ ;
             ⋮          ⋮   ⋮   ⋮   ⋮    ⋮           ⋮           ⋮   ⋱ ]

Similarly for K_3, with vertices 1, 7, 8. On the other hand, triangle T_4 is an isosceles right
triangle, and so has elemental stiffness matrix (14.116). Its vertices are labeled 1, 4, and
5, with vertex 5 at the right angle. Therefore, its contribution is

    K_4 = [  0.5   0   0    0    −0.5   0   0   0   ⋯ ;
             0     0   0    0     0     0   0   0   ⋯ ;
             0     0   0    0     0     0   0   0   ⋯ ;
             0     0   0    0.5  −0.5   0   0   0   ⋯ ;
            −0.5   0   0   −0.5   1.0   0   0   0   ⋯ ;
             0     0   0    0     0     0   0   0   ⋯ ;
             0     0   0    0     0     0   0   0   ⋯ ;
             0     0   0    0     0     0   0   0   ⋯ ;
             ⋮     ⋮   ⋮    ⋮     ⋮     ⋮   ⋮   ⋮   ⋱ ]
Note particularly how we need to permute the rows and columns of (14.116) in order to have
the vertices in the correct order. Continuing in this manner, we assemble 14 contributions
K_1, . . . , K_14, each with (at most) 9 nonzero entries. The full finite element matrix is the
sum

    K* = K_1 + ⋯ + K_14 ,

and equals

    K* =
    [  3.732    −1        0        0      −0.7887  −0.5774  −0.5774  −0.7887   0        0        0        0        0      ;
      −1         4       −1       −1       0        0        0        0       −1        0        0        0        0      ;
       0        −1        3.732    0       0        0        0        0        0       −0.7887  −0.5774  −0.5774  −0.7887 ;
       0        −1        0        2      −0.5      0        0        0        0        0        0        0       −0.5    ;
      −0.7887    0        0       −0.5     1.577   −0.2887   0        0        0        0        0        0        0      ;
      −0.5774    0        0        0      −0.2887   1.155   −0.2887   0        0        0        0        0        0      ;
      −0.5774    0        0        0       0       −0.2887   1.155   −0.2887   0        0        0        0        0      ;
      −0.7887    0        0        0       0        0       −0.2887   1.577   −0.5      0        0        0        0      ;
       0        −1        0        0       0        0        0       −0.5      2       −0.5      0        0        0      ;
       0         0       −0.7887   0       0        0        0        0       −0.5      1.577   −0.2887   0        0      ;
       0         0       −0.5774   0       0        0        0        0        0       −0.2887   1.155   −0.2887   0      ;
       0         0       −0.5774   0       0        0        0        0        0        0       −0.2887   1.155   −0.2887 ;
       0         0       −0.7887  −0.5     0        0        0        0        0        0        0       −0.2887   1.577  ]
                                                                           (14.119)

Figure 14.14.  A Square Mesh for the Oval Plate.
Since only nodes 1, 2, 3 are interior nodes, the reduced finite element matrix only uses the
upper left 3 × 3 block of K*, so

    K = [  3.732   −1       0     ;
          −1        4      −1     ;                                        (14.120)
           0       −1       3.732 ]

With some practice, one can learn how to directly construct K, bypassing K* entirely.
For a finer triangulation, the construction is similar, but the matrices become much
larger. The procedure can, of course, be automated. Fortunately, if we choose a very
regular triangulation, then we do not need to be nearly as meticulous in assembling the
stiffness matrices, since many of the entries are the same. The simplest case is when we use
a uniform square mesh, and so triangulate the domain into isosceles right triangles. This
is accomplished by laying out a relatively dense square grid over the domain Ω ⊂ ℝ². The
interior nodes are the grid points that fall inside the oval domain, while the boundary nodes
are all those grid points lying adjacent to one or more of the interior nodes. The boundary
nodes will be near, but not necessarily on, the boundary ∂Ω. Figure 14.14 shows the nodes
in a square grid with intermesh spacing h = .2. While a bit crude in its approximation
of the boundary of the domain, this procedure does have the advantage of making the
construction of the associated finite element matrix relatively painless.
For such a mesh, all the triangles are isosceles right triangles, with elemental stiffness
matrix (14.116). Summing the corresponding matrices K_ν over all the triangles, as in
(14.118), the rows and columns of K* corresponding to the interior nodes are seen to all
have the same form. Namely, if i labels an interior node, then the corresponding diagonal
entry is k_ii = 4, while the off-diagonal entries k_ij = k_ji, i ≠ j, are equal to −1
when node i is adjacent to node j on the grid, and to 0 in all other cases. Node
j is allowed to be a boundary node. (Interestingly, the result does not depend on how
one orients the pair of triangles making up each square of the grid, as in Figure 14.11; the
orientation only plays a role in the computation of the right hand side of the finite element
equation.) The same computation applies even to our coarse triangulation. The interior
node 2 belongs only to right isosceles triangles, and the corresponding entries in (14.119) are
k_22 = 4, and k_2j = −1 for the four adjacent nodes j = 1, 3, 4, 9.
Remark : Interestingly, the coefficient matrix arising from the finite element method
on a square (or even rectangular) grid is the same as the coefficient matrix arising from
a finite difference solution to the Laplace equation; see Exercise . The finite element
approach has the advantage of applying to much more general triangulations.
In general, while the finite element matrix K for a two-dimensional boundary value
problem is not as nice as the tridiagonal matrices we obtained in our one-dimensional
problems, it is still very sparse and, on regular grids, highly structured. This makes
solution of the resulting linear system particularly amenable to an iterative matrix solver
such as Gauss–Seidel, Jacobi, or, best of all, successive over-relaxation (SOR).
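On a uniform square mesh the rule just described can be coded directly, without assembling any elemental matrices. The following sketch (Python with NumPy; a hypothetical helper, not from the text) builds the reduced matrix for the interior nodes of a rectangular grid from k_ii = 4 and k_ij = −1 for grid neighbors; it is exactly the coefficient matrix of the standard five-point finite difference approximation to the Laplacian mentioned in the Remark above.

    import numpy as np

    def square_mesh_matrix(nx, ny):
        # Reduced matrix for an nx-by-ny block of interior nodes of a square grid.
        n = nx * ny
        K = np.zeros((n, n))
        idx = lambda i, j: i * ny + j                      # row-major numbering
        for i in range(nx):
            for j in range(ny):
                K[idx(i, j), idx(i, j)] = 4.0
                for di, dj in ((1, 0), (-1, 0), (0, 1), (0, -1)):
                    a, b = i + di, j + dj
                    if 0 <= a < nx and 0 <= b < ny:        # neighbor is also interior
                        K[idx(i, j), idx(a, b)] = -1.0
        return K

Neighbors that are boundary nodes simply never appear, in accordance with the construction of the reduced matrix.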
The Coefficient Vector and the Boundary Conditions
So far, we have been concentrating on assembling the finite element coefficient matrix
K. We also need to compute the forcing vector b = ( b_1, b_2, . . . , b_n )ᵀ appearing on the right
hand side of the fundamental linear equation (14.109). According to (14.107), the entries
b_i are found by integrating the product of the forcing function and the finite element basis
function. As before, we will approximate the integral over the domain Ω by a sum of
integrals over the triangles, and so

    b_i = ∫∫_Ω f φ_i dx dy ≈ Σ_ν ∫∫_{T_ν} f φ_i dx dy ≈ Σ_ν b^ν_i .        (14.121)

Typically, the exact computation of the various triangular integrals is not convenient,
and so we resort to a numerical approximation. Since we are assuming that the individual
triangles are small, we can adopt a very crude numerical integration scheme. If the function
f(x, y) does not vary much over the triangle T_ν, which will certainly be the case if T_ν is
sufficiently small, we may approximate f(x, y) ≈ c_i for (x, y) ∈ T_ν by a constant. The
integral (14.121) is then approximated by

    b^ν_i ≈ ∫∫_{T_ν} f φ_i dx dy ≈ c_i ∫∫_{T_ν} ω_i(x, y) dx dy = (1/3) c_i area T_ν = (1/6) c_i | Δ_ν |.        (14.122)

The formula for the integral of the affine element ω_i(x, y) follows from solid geometry.
Indeed, it equals the volume of the solid under its graph, which is a tetrahedron of height
1 and base T_ν; see Figure fetet .

How to choose the constant c_i? In practice, the simplest choice is to let c_i = f(x_i, y_i)
be the value of the function at the i-th vertex. With this choice, the sum in (14.121) becomes

    b_i ≈ Σ_ν (1/3) f(x_i, y_i) area T_ν = (1/3) f(x_i, y_i) area P_i ,        (14.123)

where P_i = ∪ T_ν is the vertex polygon (14.106) corresponding to the node x_i. In particular,
for the square mesh with the uniform choice of triangles, as in Example 14.17, area P_i = 3 h²
for all i, and so

    b_i ≈ f(x_i, y_i) h²                                                   (14.124)

is well approximated by just h² times the value of the forcing function at the node. In this
case, the finite element equations (14.109) are identical with the finite difference equations
based on the square grid; see Exercise . This is the underlying reason to choose the
uniform triangulation for the square mesh; the alternating version would give unequal
values for the b_i over adjacent nodes, and this would introduce additional errors into the
final approximation.
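The approximate load vector (14.123) therefore only requires the area of each vertex polygon. A minimal sketch (Python with NumPy, reusing the node and triangle data layout assumed in the earlier sketches):

    import numpy as np

    def load_vector(nodes, triangles, interior, f):
        # b_i = (1/3) f(x_i, y_i) * area(P_i), cf. (14.123), for the interior nodes.
        m = len(nodes)
        polygon_area = np.zeros(m)
        for (i, j, k) in triangles:
            x, y = nodes[[i, j, k], 0], nodes[[i, j, k], 1]
            area = 0.5 * abs((x[1] - x[0]) * (y[2] - y[0]) - (x[2] - x[0]) * (y[1] - y[0]))
            for p in (i, j, k):
                polygon_area[p] += area   # each triangle lies in the vertex polygons of its 3 vertices
        return np.array([f(*nodes[i]) * polygon_area[i] / 3.0 for i in interior])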
Example 14.20. For the coarsely triangulated oval plate, the reduced stiffness matrix is (14.120). The Poisson equation −Δu = 4 models a constant external heat source
of magnitude 4 over the entire plate. If we keep the edges of the plate fixed at 0°, then
we need to solve the finite element equation K c = b, where K is the coefficient matrix
(14.120), while

    b = (4/3) ( 2 + (3√3)/4 , 2 , 2 + (3√3)/4 )ᵀ = ( 4.39872, 2.66667, 4.39872 )ᵀ.

The entries of b are, by (14.123), equal to 4 = f(x_i, y_i) times one third the area of the
corresponding vertex polygon, which for node 2 is the square consisting of 4 right triangles,
each of area 1/2, whereas for nodes 1 and 3 it consists of 4 right triangles of area 1/2 plus
three equilateral triangles, each of area √3/4; see Figure 14.13.
The solution to the final linear system is easily found:

    c = ( 1.56724, 1.45028, 1.56724 )ᵀ.

Its entries are the values of the finite element approximation at the three interior nodes.
The finite element solution is plotted in the first illustration in Figure 14.15. A more
accurate solution, based on a square grid triangulation of size h = .1, is plotted in the
second figure.
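The final 3 × 3 solve in Example 14.20 is elementary; for instance, in Python with NumPy (illustrative only, using the numbers quoted above):

    import numpy as np

    K = np.array([[ 3.732, -1.0,  0.0  ],
                  [-1.0,    4.0, -1.0  ],
                  [ 0.0,   -1.0,  3.732]])          # reduced matrix (14.120)
    b = np.array([4.39872, 2.66667, 4.39872])       # load vector from Example 14.20
    c = np.linalg.solve(K, b)
    print(c)     # approximately [1.567, 1.450, 1.567], the nodal values quoted above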
Inhomogeneous Boundary Conditions
So far, we have restricted our attention to problems with homogeneous Dirichlet
boundary conditions. According to Theorem 14.14, the solution to the inhomogeneous
Dirichlet problem

    −Δu = f   in Ω,        u = h   on ∂Ω,

is also obtained by minimizing the Dirichlet functional (14.91). However, now the minimization takes place over the affine subspace consisting of all functions that satisfy the

Figure 14.15.  Finite Element Solutions to Poisson's Equation for an Oval Plate.

inhomogeneous boundary conditions. It is not difficult to fit this problem into the finite
element scheme.
The elements corresponding to the interior nodes of our triangulation remain as before,
but now we need to include additional elements to ensure that our approximation satisfies
the boundary conditions. Note that if x_k is a boundary node, then the corresponding
boundary element φ_k(x, y) satisfies the interpolation condition (14.97), and so has the
same piecewise affine form (14.105). The corresponding finite element approximation

    w(x, y) = Σ_{i=1}^m  c_i φ_i(x, y),                                    (14.125)

has the same form as before, (14.98), but now the sum is over all m nodes, both interior
and boundary. As before, the coefficients c_i = w(x_i, y_i) ≈ u(x_i, y_i) are the values of the
finite element approximation at the nodes. Therefore, in order to satisfy the boundary
conditions, we require

    c_j = h(x_j, y_j)        whenever        x_j = (x_j, y_j)   is a boundary node.        (14.126)

Remark: If the boundary node x_j does not lie precisely on the boundary ∂Ω, we need
to approximate the value h(x_j, y_j) appropriately, e.g., by using the value of h(x, y) at the
nearest boundary point (x, y) ∈ ∂Ω.
The derivation of the finite element equations proceeds as before, but now there are
additional terms arising from the nonzero boundary values. Leaving the intervening details
to the reader, the final outcome can be written as follows. Let K* denote the full m × m
finite element matrix constructed as above. The reduced coefficient matrix K is obtained
by retaining the rows and columns corresponding to only interior nodes, and so will have
size n × n, where n is the number of interior nodes. The boundary coefficient matrix K̃
is the n × (m − n) matrix consisting of the entries of the interior rows that do not appear
in K, i.e., those lying in the columns indexed by the boundary nodes. For instance, in
the coarse triangulation of the oval plate, the full finite element matrix is given in (14.119),
and the upper 3 × 3 subblock is the reduced matrix (14.120). The remaining entries of the

Figure 14.16.  Solution to the Dirichlet Problem for the Oval Plate.

first three rows form the boundary coefficient matrix

    K̃ = [  0      −0.7887  −0.5774  −0.5774  −0.7887   0       0        0        0        0      ;
          −1       0        0        0        0       −1       0        0        0        0      ;
           0       0        0        0        0        0      −0.7887  −0.5774  −0.5774  −0.7887 ]
                                                                           (14.127)
We similarly split the coefficients c_i of the finite element function (14.125) into two groups.
We let c ∈ ℝⁿ denote the as yet unknown coefficients c_i corresponding to the values of the
approximation at the interior nodes x_i, while h ∈ ℝ^{m−n} will be the vector of boundary
values (14.126). The solution to the finite element approximation (14.125) is obtained by
solving the associated linear system

    K c + K̃ h = b,        or        K c = f = b − K̃ h.                    (14.128)

The full justification of this system is left as an exercise for the reader.
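Concretely, once the full matrix K* has been assembled, the splitting into K and K̃ and the solve (14.128) take only a few lines. The following sketch (Python with NumPy; variable names are illustrative) assumes the prescribed boundary values have already been collected into a vector h ordered like the boundary node list.

    import numpy as np

    def solve_inhomogeneous_dirichlet(K_full, b, interior, boundary, h):
        K  = K_full[np.ix_(interior, interior)]     # reduced coefficient matrix K
        Kt = K_full[np.ix_(interior, boundary)]     # boundary coefficient matrix K-tilde
        c  = np.linalg.solve(K, b - Kt @ h)         # K c = b - K~ h, cf. (14.128)
        return c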
Example 14.21. For the oval plate discussed in Example 14.19, suppose the right
hand semicircular edge is held at 10°, the left hand semicircular edge at −10°, while the
two straight edges have a linearly varying temperature distribution ranging from −10° at
the left to 10° at the right, as illustrated in Figure 14.16. Thus, for the coarse triangulation
we have the boundary node values

    h = ( h_4 , . . . , h_13 )ᵀ = ( 0, −1, −1, −1, −1, 0, 1, 1, 1, 1 )ᵀ.

Using the previously computed formulae (14.120), (14.127) for the interior coefficient matrix K and boundary coefficient matrix K̃, we approximate the solution to the Laplace
equation by solving (14.128). Since there is no external forcing function, f(x, y) ≡ 0, the
right hand side is b = 0, and so we must solve K c = f = −K̃ h = ( 2.18564, 3.6, 7.64974 )ᵀ.
The finite element function corresponding to the solution c = ( 1.06795, 1.8, 2.53205 )ᵀ is
plotted in the first illustration in Figure 14.16. Even on such a coarse mesh, the approximation is not too bad, as evidenced by the second illustration, which plots the finite element
solution for a square mesh with spacing h = .2 between nodes.

Second Order Elliptic Boundary Value Problems


While the Laplace and Poisson equations are by far the most important elliptic partial
differential equations, they only model homogeneous media, e.g., membranes made out of
a uniform material, or heated plates with uniform (constant) heat capacity. Inhomogeneous media lead to more general self-adjoint differential operators, leading to variable
coefficient second order elliptic boundary value problems. Even more generally, elastic
shells, meaning bendable plates, lead to fourth order two-dimensional elliptic boundary
value problems similar to the one-dimensional beam equation (10.102). And, these are in
turn only linear approximations to the fully nonlinear elliptic boundary value problems
occurring in elasticity theory, [56]. The latter are beyond the scope of this text, although
some of the required mathematical tools appear in Chapter 20.
The most important class of linear, self-adjoint, second order, elliptic partial differential equations in two space variables takes the form

    − ∂/∂x ( p(x, y) ∂u/∂x ) − ∂/∂y ( q(x, y) ∂u/∂y ) + r(x, y) u = f(x, y),        (x, y) ∈ Ω,        (14.129)

where p(x, y), q(x, y) > 0 are strictly positive functions, while r(x, y) ≥ 0 is non-negative.
For simplicity, we also impose homogeneous Dirichlet boundary conditions u = 0 on ∂Ω.
Note that the positivity conditions ensure that the partial differential equation is elliptic
in accordance with the classification of Definition 14.1.
The reader may notice that (14.129) is a two-dimensional version of the Sturm–Liouville ordinary differential equation (10.132). The self-adjoint formulation (10.135)
of a Sturm–Liouville boundary value problem serves to inspire the self-adjoint form

    L*∘L[ u ] = f,        by setting        L[ u ] = ( u_x , u_y , u )ᵀ,        (14.130)

of the boundary value problem for (14.129). Note that the linear operator L: U → V maps
the vector space U consisting of all smooth functions u(x, y) satisfying the homogeneous
Dirichlet boundary conditions to the vector space V consisting of all vector-valued functions
v = ( v_1(x, y), v_2(x, y), v_3(x, y) )ᵀ. We adopt the usual L² inner product (14.74) on U, but
introduce a weighted inner product†

    ⟨⟨ v ; ṽ ⟩⟩ = ∫∫_Ω [ p v_1 ṽ_1 + q v_2 ṽ_2 + r v_3 ṽ_3 ] dx dy

on the vector space V. A straightforward computation based on Green's formula (14.78)
produces the weighted adjoint

    L*[ v ] = − ∂/∂x ( p(x, y) v_1(x, y) ) − ∂/∂y ( q(x, y) v_2(x, y) ) + r(x, y) v_3(x, y)        (14.131)

    † Technically, we should require that r(x, y) not vanish identically on any open subdomain in order
that this define a nondegenerate inner product.


of the operator L. Therefore, the formula for the self-adjoint product

    L*∘L[ u ] = L*[ ( u_x , u_y , u )ᵀ ] = − ∂/∂x ( p(x, y) ∂u/∂x ) − ∂/∂y ( q(x, y) ∂u/∂y ) + r(x, y) u(x, y)

proves the identification of (14.130) and (14.129). Positive definiteness follows from the
observation that ker L = {0}. The minimization principle associated with the operator L
is, as usual,

    P[ u ] = (1/2) ‖ L[ u ] ‖² − ⟨ f ; u ⟩ = ∫∫_Ω [ (1/2) p u_x² + (1/2) q u_y² + (1/2) r u² − f u ] dx dy.        (14.132)

As always, the solution to our boundary value problem is the unique minimizing function
for P[ u ] among all functions u ∈ U satisfying the homogeneous boundary conditions.

Remark: Interestingly, in contrast to the Poisson equation, if r > 0 the boundary
value problem for (14.129) is positive definite with minimization principle (14.132) even
in the case of pure Neumann boundary conditions. This is because the operator L always
has trivial kernel.
The finite element approximation is constructed as in the Poisson version by restricting the minimization principle to the finite-dimensional subspace spanned by the
finite element basis functions (10.150). This requires the solution of a linear system of the
same form (14.128), in which

    k_ij = ⟨⟨ L[ φ_i ] ; L[ φ_j ] ⟩⟩ = ∫∫_Ω [ p (∂φ_i/∂x)(∂φ_j/∂x) + q (∂φ_i/∂y)(∂φ_j/∂y) + r φ_i φ_j ] dx dy,

    b_i = ⟨ f ; φ_i ⟩ = ∫∫_Ω f φ_i dx dy.                                  (14.133)

As before, the double integrals are approximated by a sum of integrals over the triangles
T_ν. The only triangles that contribute to the final result for k_ij are the ones that have both
x_i and x_j as vertices. When the triangles are small, the integrals can be approximated
by fairly crude numerical integration formulae. This completes our brief outline of the
method; full details are left to the reader.
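As an illustration of such a crude scheme, the triangle-by-triangle contributions to (14.133) can be approximated with constant gradients and a one-point quadrature rule at the centroid. The following Python/NumPy sketch is only one possible realization and is not taken from the text; p, q, r are assumed to be ordinary Python functions of (x, y).

    import numpy as np

    def elemental_variable_stiffness(verts, p, q, r):
        # Approximate contribution of one triangle to k_ij in (14.133).
        x, y = verts[:, 0], verts[:, 1]
        Delta = np.linalg.det(np.column_stack([np.ones(3), x, y]))
        area = abs(Delta) / 2
        grads = np.array([[(y[(i + 1) % 3] - y[(i + 2) % 3]) / Delta,
                           (x[(i + 2) % 3] - x[(i + 1) % 3]) / Delta] for i in range(3)])
        cx, cy = verts.mean(axis=0)                  # centroid of the triangle
        S = area * (p(cx, cy) * np.outer(grads[:, 0], grads[:, 0])
                    + q(cx, cy) * np.outer(grads[:, 1], grads[:, 1]))
        # each basis function equals 1/3 at the centroid, so the reaction term gives r/9 per entry
        S += r(cx, cy) * area / 9.0 * np.ones((3, 3))
        return S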
Example 14.22. The Helmholtz equation is

    Δu + λ u = 0,                                                          (14.134)

along with suitable boundary conditions. As we shall learn in Chapter 16, the Helmholtz
equation governs the eigenvalues of the Laplacian, and as such determines the fundamental
modes of vibration of a wide variety of mechanical systems, including the vibration of
plates, scattering of acoustic and electromagnetic waves, and many others.
If λ < 0, then the Helmholtz equation fits into the positive definite framework (14.129),
with p = q = 1 and r = −λ. To solve the problem by finite elements, we restrict the
minimization principle

    P[ u ] = ∫∫_Ω [ (1/2) ‖ ∇u ‖² − (1/2) λ u² − f u ] dx dy                (14.135)

to the finite-dimensional finite element subspace determined by a triangulation of the
underlying domain. The resulting coefficient matrix has the form

    k_ij = ∫∫_Ω [ ∇φ_i · ∇φ_j − λ φ_i φ_j ] dx dy ≈ Σ_ν ∫∫_{T_ν} [ ∇φ_i · ∇φ_j − λ φ_i φ_j ] dx dy = Σ_ν k^ν_ij .        (14.136)

The explicit formulae for the k^ν_ij are left as an exercise for the reader. The forcing vector
b has exactly the same form (14.107) as in the Poisson example.
Unfortunately, the most interesting cases are when λ > 0 and the boundary value
problem is not positive definite; nevertheless, the finite element approach can still give
quite respectable answers, even though it lacks a simple theoretical justification.


Chapter 15
Complex Analysis
The term complex analysis refers to the calculus of complex-valued functions f (z)
depending on a complex variable z. On the surface, it may seem that this subject should
merely be a simple reworking of standard real variable theory that you learned in first year
calculus. However, this naïve first impression could not be further from the truth! Complex analysis is the culmination of a deep and far-ranging study of the fundamental notions
of complex differentiation and complex integration, and has an elegance and beauty not
found in the more familiar real arena. For instance, complex functions are always analytic, meaning that they can be represented as convergent power series. As an immediate
consequence, a complex function automatically has an infinite number of derivatives, and
difficulties with degree of smoothness, strange discontinuities, delta functions, and other
forms of pathological behavior of real functions never arise in the complex realm.
There is a remarkable, profound connection between harmonic functions (solutions of
the Laplace equation) of two variables and complex-valued functions. Namely, the real
and imaginary parts of a complex analytic function are automatically harmonic. In this
manner, complex functions provide a rich lode of new solutions to the two-dimensional
Laplace equation to help solve boundary value problems. One of the most useful practical
consequences arises from the elementary observation that the composition of two complex
functions is also a complex function. We interpret this operation as a complex change
of variables, also known as a conformal mapping since it preserves angles. Conformal
mappings can be effectively used for constructing solutions to the Laplace equation on
complicated planar domains, and play a particularly important role in the solution of
physical problems.
Complex integration also enjoys many remarkable properties not found in its real
sibling. Integrals of complex functions are similar to the line integrals of planar multivariable calculus. The remarkable theorem due to Cauchy implies that complex integrals
are generally path-independent provided one pays proper attention to the complex
singularities of the integrand. In particular, an integral of a complex function around a
closed curve can be directly evaluated through the calculus of residues, which effectively
bypasses the Fundamental Theorem of Calculus. Surprisingly, the method of residues can
even be applied to evaluate certain types of definite real integrals.
In this chapter, we shall introduce the basic techniques and theorems in complex
analysis, paying particular attention to those aspects which are required to solve boundary
value problems associated with the planar Laplace and Poisson equations. Complex analysis is an essential tool in a surprisingly broad range of applications, including fluid flow,
elasticity, thermostatics, electrostatics, and, in mathematics, geometry, and even number

theory. Indeed, the most famous unsolved problem in all of mathematics, the Riemann hypothesis, is a conjecture about a specific complex function that has profound consequences
for the distribution of prime numbers .

15.1. Complex Variables.


In this section we shall develop the basics of complex analysis, the calculus of
complex functions f(z). Here z = x + i y is a single complex variable and f : Ω → C is
a complex-valued function defined on a domain z ∈ Ω ⊂ C in the complex plane. Before
diving into this material, the reader should first review the basic material on complex
numbers in Section 3.6.
Any complex function can be written as a complex combination
f (z) = f (x + i y) = u(x, y) + i v(x, y),

(15.1)

of two real functions u, v of two real variables x, y, called, respectively, its real and imaginary parts, and written
u(x, y) = Re f (z),

and

v(x, y) = Im f (z).

(15.2)

For example, the monomial function f(z) = z³ is written as

    z³ = (x + i y)³ = (x³ − 3 x y²) + i (3 x² y − y³),

and so

    Re z³ = x³ − 3 x y²,        Im z³ = 3 x² y − y³.

As we know, we can identify C with the real, two-dimensional plane ℝ², so that the
complex number z = x + i y ∈ C is identified with the real vector ( x, y )ᵀ ∈ ℝ². Based
on the identification C ≃ ℝ², we shall adopt the usual terminology from planar vector
calculus, e.g., domain, curve, etc., without alteration; see Chapter A for details. In this
manner, we may regard a complex function as a particular type of real vector field that maps

    ( x , y )ᵀ ∈ ℝ²        to the vector        ( u(x, y) , v(x, y) )ᵀ ∈ ℝ².        (15.3)

Not every real vector field qualifies as a complex function; the components u(x, y), v(x, y)
must satisfy certain fairly stringent requirements; see Theorem 15.3 below.
Many of the well-known functions appearing in real-variable calculus polynomials,
rational functions, exponentials, trigonometric functions, logarithms, and many others
have natural complex extensions. For example, complex polynomials

    p(z) = a_n zⁿ + a_{n−1} z^{n−1} + ⋯ + a_1 z + a_0                       (15.4)

are complex linear combinations (meaning that the coefficients a_k are allowed to be complex
numbers) of the basic monomial functions z^k = (x + i y)^k. Similarly, we have already made

Not to mention that a solution will net you a cool $1,000,000.00. See http://www.claymath.org
for details on how to claim your prize.


sporadic use of complex exponentials such as e^z = e^{x + i y} for solving differential equations.
Other examples will appear shortly.
There are several ways to motivate the link between harmonic functions u(x, y),
meaning solutions of the two-dimensional Laplace equation

    Δu = ∂²u/∂x² + ∂²u/∂y² = 0,                                            (15.5)

and complex functions. One natural starting point is to return to the d'Alembert solution
(13.117) of the one-dimensional wave equation, which was based on the factorization

    ∂_t² − c² ∂_x² = (∂_t − c ∂_x) (∂_t + c ∂_x)

of the linear wave operator (13.105). The two-dimensional Laplace operator Δ = ∂_x² + ∂_y²
has essentially the same form, except for a minor change in sign. We cannot produce a
real factorization of the Laplace operator, but there is a complex factorization,

    Δ = ∂_x² + ∂_y² = (∂_x − i ∂_y) (∂_x + i ∂_y),

into a product of two complex first order differential operators. The wave speed has now
become complex: c = ± i. Mimicking the solution formula (13.113) for the wave equation,
we expect that the solutions to the Laplace equation (15.5) should be expressed in the
form

    u(x, y) = f(x + i y) + g(x − i y),                                     (15.6)

i.e., a linear combination of functions of the complex variable z = x + i y and its complex
conjugate z̄ = x − i y. The functions f and g satisfy the first order complex partial
differential equations

    ∂f/∂x = − i ∂f/∂y,        ∂g/∂x = i ∂g/∂y,                             (15.7)

and hence (15.6) does indeed define a complex-valued solution to the Laplace equation.
In most applications, we are searching for a real solution to the Laplace equation, and
so our d'Alembert-type formula (15.6) is not entirely satisfactory. As we know, a complex
number z = x + i y is real if and only if it equals its own conjugate, z = z̄. Thus, the
solution (15.6) will be real if and only if

    f(x + i y) + g(x − i y) = u(x, y) = \overline{u(x, y)} = \overline{f(x + i y)} + \overline{g(x − i y)}.

Now, the complex conjugation operation switches x + i y and x − i y, and so we expect the
first term \overline{f(x + i y)} to be a function of x − i y, while the second term \overline{g(x − i y)} will be a
function of x + i y.

A reader uninterested in the motivation can skip ahead to Proposition 15.1 at this point.

However, the change in sign has dramatic consequences for the analytical properties of solutions to the two equations. According to Section 14.1, there is a vast difference between the
elliptic Laplace equation and the hyperbolic wave equation.


Therefore, to equate the two sides of this equation, we should require

    g(x − i y) = \overline{f(x + i y)},

and so

    u(x, y) = f(x + i y) + \overline{f(x + i y)} = 2 Re f(x + i y).

Dropping the inessential factor of 2, we conclude that a real solution to the two-dimensional
Laplace equation can be written as the real part of a complex function. A direct proof of
the following key result will appear below.
Proposition 15.1. If f (z) is a complex function, then its real part
u(x, y) = Re f (x + i y)

(15.8)

is a harmonic function.
The imaginary part of a complex function is also harmonic. This is because

    Im f(z) = Re ( − i f(z) )

is the real part of the complex function

    − i f(z) = − i [ u(x, y) + i v(x, y) ] = v(x, y) − i u(x, y).
Therefore, if f (z) is any complex function, we can write it as a complex combination
f (z) = f (x + i y) = u(x, y) + i v(x, y),
of two real harmonic functions u(x, y) = Re f (z) and v(x, y) = Im f (z).
Before delving into the many remarkable properties of complex functions, we look at
some of the most basic examples. In each case, the reader can check directly that the
harmonic functions given as the real and imaginary parts of the complex function are
indeed solutions to the Laplace equation.
Examples of Complex Functions
(a) Harmonic Polynomials: The simplest examples of complex functions are polynomials. Any polynomial is a complex linear combinations, as in (15.4), of the basic complex
monomials
z n = (x + i y)n = un (x, y) + i vn (x, y).
(15.9)
The real and imaginary parts of a complex polynomial are known as harmonic polynomials,
and we list the first few below. The general formula for the basic harmonic polynomials
un (x, y) and vn (x, y) is easily found by applying the binomial theorem, as in Exercise .

Harmonic Polynomials

We are ignoring the fact that f and g are not quite uniquely determined since one can add
and subtract a constant from them. This does not affect the argument in any significant way.


      n      zⁿ                                              u_n(x, y)              v_n(x, y)

      0      1                                               1                      0
      1      x + i y                                         x                      y
      2      (x² − y²) + 2 i x y                             x² − y²                2 x y
      3      (x³ − 3 x y²) + i (3 x² y − y³)                 x³ − 3 x y²            3 x² y − y³
      4      (x⁴ − 6 x² y² + y⁴) + i (4 x³ y − 4 x y³)       x⁴ − 6 x² y² + y⁴      4 x³ y − 4 x y³
      ⋮      ⋮                                               ⋮                      ⋮
We have, in fact, already encountered these polynomial solutions to the Laplace equation. If we write

    z = r e^{i θ},                                                          (15.10)

where

    r = | z | = √(x² + y²),        θ = ph z = tan⁻¹(y/x),

are the usual polar coordinates (modulus and phase) of z = x + i y, then Euler's formula (3.74) yields

    zⁿ = rⁿ e^{i n θ} = rⁿ cos n θ + i rⁿ sin n θ,

and so

    u_n = rⁿ cos n θ,        v_n = rⁿ sin n θ.

Therefore, the harmonic polynomials are just the polar coordinate solutions (14.35) to
the Laplace equation we obtained previously by the method of separation of variables. In
Figure z2z3 we plot the real and imaginary parts of the monomials z² and z³.
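These statements are easily confirmed symbolically. For example, the following small Python/SymPy computation (purely illustrative, not part of the text) checks that the real and imaginary parts of zⁿ are harmonic for the first few values of n:

    import sympy as sp

    x, y = sp.symbols('x y', real=True)
    z = x + sp.I * y
    for n in range(5):
        u = sp.re(sp.expand(z**n))
        v = sp.im(sp.expand(z**n))
        assert sp.simplify(sp.diff(u, x, 2) + sp.diff(u, y, 2)) == 0    # u_n is harmonic
        assert sp.simplify(sp.diff(v, x, 2) + sp.diff(v, y, 2)) == 0    # v_n is harmonic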
(b) Rational Functions: Ratios

    f(z) = p(z)/q(z)                                                       (15.11)

of complex polynomials provide a large variety of harmonic functions. The simplest case
is

    1/z = z̄/(z z̄) = z̄/| z |² = x/(x² + y²) − i y/(x² + y²).               (15.12)

Its real and imaginary parts are graphed in Figure 15.1. Note that these functions have
an interesting singularity at the origin x = y = 0, and are harmonic everywhere else.
an interesting singularity at the origin x = y = 0, and are harmonic everywhere else.

Graphing a complex function f : C → C is problematic. The identification (15.3) of f with a
real vector-valued function f : ℝ² → ℝ² implies that one requires four real dimensions to display
the complete graph.


Figure 15.1.  Real and Imaginary Parts of f(z) = 1/z.

A slightly more complicated example is the useful function

    f(z) = (z − 1)/(z + 1).                                                (15.13)

To write out (15.13) in real form, we multiply both numerator and denominator by the
complex conjugate of the denominator, leading to

    f(z) = (z − 1)/(z + 1) = (z − 1)(z̄ + 1)/((z + 1)(z̄ + 1)) = (| z |² − 1 + z − z̄)/| z + 1 |²
         = (x² + y² − 1)/((x + 1)² + y²) + i (2 y)/((x + 1)² + y²).        (15.14)

This manipulation can always be used to find the real and imaginary parts of general
rational functions.
If we assume that the rational function (15.11) is written in lowest terms, so p and q
have no common factors, then f (z) will have a singularity, known as a pole, wherever the
denominator vanishes: q(z_0) = 0. The order of the root z_0 of q(z) tells us the order of
the pole of f(z). For example, the rational function

    f(z) = (z + 2)/(z⁵ + z³) = (z + 2)/((z + i)(z − i) z³)

has three poles: a simple (of order 1) pole at z = + i, another simple pole at z = − i, and
a triple (order 3) pole at z = 0.
(c) Complex Exponentials: Euler's formula

    e^z = e^x cos y + i e^x sin y                                          (15.15)

for the complex exponential, cf. (3.74), yields two important harmonic functions: e^x cos y
and e^x sin y, which are graphed in Figure 3.7. More generally, writing out e^{c z} for a complex

Recall that the order of a root z_0 of a polynomial q(z) is the number of times z − z_0 occurs as
a factor of q(z).


Figure 15.2.  Real and Imaginary Parts of log z = log r + i θ.

constant c = a + i b produces the general complex exponential function

    e^{c z} = e^{a x − b y} cos(b x + a y) + i e^{a x − b y} sin(b x + a y).        (15.16)

Its real and imaginary parts are harmonic for arbitrary a, b ∈ ℝ. We already encountered
some of these solutions to the Laplace equation when we used the separation of variables
method in Cartesian coordinates; see the table in Section 14.2.
(d) Complex Trigonometric Functions: The complex trigonometric functions are defined in terms of the complex exponential by adapting our earlier formulae (3.76):

    cos z = (e^{i z} + e^{− i z})/2 = cos x cosh y − i sin x sinh y,
                                                                           (15.17)
    sin z = (e^{i z} − e^{− i z})/(2 i) = sin x cosh y + i cos x sinh y.

The resulting harmonic functions are products of trigonometric and hyperbolic functions.
They can all be written as linear combinations of the harmonic functions (15.16) derived
from the complex exponential. Note that when z = x is real, so y = 0, these functions
reduce to the usual real trigonometric functions cos x and sin x.
(e) Complex Logarithm: In a similar fashion, the complex (natural) logarithm log z
is a complex extension of the usual real natural (i.e., base e) logarithm. In terms of polar
coordinates (15.10), the complex logarithm has the form

    log z = log(r e^{i θ}) = log r + log e^{i θ} = log r + i θ.            (15.18)

Thus, the logarithm of a complex number has real part

    Re log z = log r = (1/2) log(x² + y²),

which is a well-defined harmonic function on all of ℝ² except at the origin x = y = 0, where


it has a logarithmic singularity. It is, in fact, the logarithmic potential corresponding to a

delta function forcing concentrated at the origin that played a key role in the construction
of the Green's function for the Poisson equation in Section 14.3.
The imaginary part

    Im log z = θ = ph z = tan⁻¹(y/x)

of the complex logarithm is the phase or polar angle of z. The phase is also not defined
at the origin x = y = 0. Moreover, it is a multiply-valued harmonic function elsewhere,
since it is only specified up to integer multiples of 2π. Thus, a given nonzero complex
number z ≠ 0 has an infinite number of possible values for its phase, and hence an infinite
number of possible complex logarithms log z, each differing by an integer multiple of 2π i,
reflecting the fact that e^{2π i} = 1. In particular, if z = x > 0 is real and positive, then
log z = log x agrees with the real logarithm, provided we choose the angle ph z = 0.
Alternative choices for the phase add a multiple of 2π i, and so ordinary real, positive
numbers x > 0 also have complex logarithms! On the other hand, if z = x < 0 is real
and negative, then log z = log | x | + (2 k + 1) π i is complex no matter which value of ph z
is chosen. (This explains why we didn't attempt to define the logarithm of a negative
number in first year calculus!) In general, as we circle around the origin in a counterclockwise direction, Im log z = ph z = θ increases by 2π, and so its graph can be likened
to an infinitely tall parking ramp with infinitely many levels, spiraling upwards as one
goes around the origin, as sketched in Figure 15.2. For the complex logarithm, the origin
is a type of singularity known as a logarithmic branch point, indicating that there are an
infinite number of possible branches, meaning values that can be assigned to log z at any
nonzero point.
Although the complex logarithm log z is not a single-valued complex function on all of
C \ {0}, it can be continuously and unambiguously defined when restricted to any simply
connected domain Ω ⊂ C \ {0} that does not include the origin. Essentially, the specification
of the logarithm amounts to an unambiguous choice of level of our parking ramp sitting over
the domain Ω. For instance, if we restrict our attention to points in the domain

    Ω = C \ { x = Re z ≤ 0, y = Im z = 0 } = { − π < ph z < π },

obtained by cutting the complex plane along the negative real axis, then we can uniquely
specify an angle by requiring that it lie between − π and π. This in turn produces a unique,
continuous specification of log z for all z ∈ Ω. However, other choices are possible, and,
indeed, may be required for a given application.
(f) Roots and Fractional Powers: A similar branching phenomenon occurs with the
fractional powers and roots of complex numbers. The simplest case is the square root
function √z. Every nonzero complex number z ≠ 0 has two different possible square
roots: √z and −√z. As illustrated in Figure 15.3, the two square roots lie on opposite
sides of the origin, and are obtained by multiplying by −1. Writing z = r e^{i θ} in polar
coordinates, we see that

    √z = √(r e^{i θ}) = √r e^{i θ/2} = √r ( cos(θ/2) + i sin(θ/2) ),       (15.19)

Figure 15.3.  Square Roots of a Complex Number.

i.e., we take the square root of the modulus and halve the phase:

    | √z | = √| z | = √r,        ph √z = (1/2) ph z = θ/2.

Since θ is only defined up to an integer multiple of 2π, the angle θ/2 is only defined up to an
integer multiple of π. The odd and even multiples yield different values for (15.19), which
accounts for the two possible values of the square root. For example, since ph 4 i = π/2 or
5π/2, we find

    √(4 i) = ± 2 √i = ± 2 e^{i π/4} = ± 2 ( cos(π/4) + i sin(π/4) ) = ± ( √2 + i √2 ).

If we start at some z ≠ 0 and circle once around the origin, we increase ph z by 2π,
but ph √z only increases by π. Thus, at the end of our circumambulation, we arrive at the
other square root −√z. Circling the origin again increases ph z by a further 2π, and hence
brings us back to the original square root √z. Therefore, the graph of the multiply-valued
square root function will look like a weirdly interconnected parking ramp with only two
levels, as in Figure 15.4.
Similar remarks apply to the n-th root

    ⁿ√z = ⁿ√(r e^{i θ}) = ⁿ√r ( cos(θ/n) + i sin(θ/n) ),                    (15.20)

which, except for z = 0, has n possible values, depending upon which multiple of 2π is
used in the assignment of ph z = θ. The n different n-th roots are obtained by multiplying
any one of them by the different n-th roots of unity, ζ_n^k = e^{2 k π i /n} for k = 0, . . . , n − 1, as
defined in (12.12).
These graphs are best appreciated in a fully functional three-dimensional graphics viewer.


Figure 15.4.  Real and Imaginary Parts of √z.

In this case, the origin z = 0 is called a branch point of order n since
there are n different branches for the function ⁿ√z. Circling around the origin leads to
successive branches, returning, after circling n times, to the original branch of ⁿ√z.
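Numerically, the n different n-th roots are easily listed by taking one of them and multiplying by the n-th roots of unity, exactly as described above. A small Python sketch (illustrative only):

    import cmath

    def nth_roots(z, n):
        # All n complex n-th roots of a nonzero complex number z.
        r, theta = abs(z), cmath.phase(z)
        principal = r ** (1.0 / n) * cmath.exp(1j * theta / n)
        return [principal * cmath.exp(2j * cmath.pi * k / n) for k in range(n)]

    print(nth_roots(4j, 2))    # the two square roots of 4i, approximately +/-(1.414 + 1.414j)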
The preceding list of examples is far from exhausting the range and variety of complex
functions. Lack of space will preclude us from studying the remarkable properties of
complex versions of the gamma function, Airy functions, Bessel functions, and Legendre
functions that appear in Appendix C, as well as elliptic functions, the Riemann zeta
function, modular functions, and many, many other important and fascinating functions
arising in complex analysis and its manifold applications; see [118, 125].

15.2. Complex Differentiation.


Complex function theory is founded upon the notion of the complex derivative. Complex differentiation is defined in a direct analogy with the usual calculus limit definition of
the derivative of a real function. Yet, despite a superficial similarity, the resulting theory of
complex differentiation is profoundly different, and has an elegance and depth not shared
by its real progenitor.
Definition 15.2. A complex function f(z) is differentiable at a point z ∈ C if and
only if the limiting difference quotient exists:

    f′(z) = lim_{w → z}  ( f(w) − f(z) ) / ( w − z ).                       (15.21)

The key feature of this definition is that the limiting value f′(z) of the difference
quotient must be independent of how the point w converges to z. On the real line, there
are only two basic directions to approach a limiting point, either from the left or from
the right. These lead to the concepts of left and right handed derivatives, and their equality

Figure 15.5.  Complex Derivative Directions.

is required for the existence of the usual derivative of a real function. In the complex plane,
there are an infinite variety of directions in which one can approach the point z, and the
definition requires that all of these directional derivatives must agree. This is the reason
for the more severe restrictions on complex derivatives, and, in consequence, the source of
their remarkable properties.
Let us see what happens when we approach z along the two simplest directions,
horizontal and vertical. If we set

    w = z + h = (x + h) + i y,        where h is real,

then w → z along a horizontal line as h → 0, as in Figure 15.5. If we write out

    f(z) = u(x, y) + i v(x, y)

in terms of its real and imaginary parts, then we must have

    f′(z) = lim_{h → 0} [ f(z + h) − f(z) ] / h = lim_{h → 0} [ f(x + h + i y) − f(x + i y) ] / h
          = lim_{h → 0} [ ( u(x + h, y) − u(x, y) ) / h + i ( v(x + h, y) − v(x, y) ) / h ]
          = ∂u/∂x + i ∂v/∂x = ∂f/∂x,

which follows from the usual definition of the (real) partial derivative. On the other hand,
if we set

    w = z + i k = x + i (y + k),        where k is real,

Not to mention other approaches along parabolas, spirals, etc., although, as it turns out,
these more exotic routes do not lead to any further restrictions on the function.


then w → z along a vertical line as k → 0. Therefore, we must also have

    f′(z) = lim_{k → 0} [ f(z + i k) − f(z) ] / ( i k ) = lim_{k → 0} [ − i ( f(x + i (y + k)) − f(x + i y) ) / k ]
          = lim_{k → 0} [ ( v(x, y + k) − v(x, y) ) / k − i ( u(x, y + k) − u(x, y) ) / k ]
          = ∂v/∂y − i ∂u/∂y = − i ∂f/∂y.

When we equate the real and imaginary parts of these two distinct formulae for the complex
derivative f′(z), we discover that the real and imaginary components of f(z) must satisfy a
certain homogeneous linear system of partial differential equations, named after Augustin
Louis Cauchy and Bernhard Riemann, two of the principal founders of modern complex
analysis.
Theorem 15.3. A function f(z) has a complex derivative f′(z) if and only if its
real and imaginary parts are continuously differentiable and satisfy the Cauchy–Riemann
equations

    ∂u/∂x = ∂v/∂y,        ∂u/∂y = − ∂v/∂x.                                 (15.22)

In this case, the complex derivative of f(z) is equal to any of the following expressions:

    f′(z) = ∂f/∂x = ∂u/∂x + i ∂v/∂x = − i ∂f/∂y = ∂v/∂y − i ∂u/∂y.         (15.23)

The proof of the converse that any function whose real and imaginary components
satisfy the CauchyRiemann equations is differentiable will be omitted, but can be
found in any basic text on complex analysis, e.g., [4, 101].
Remark : It is worth pointing out that equation (15.23) tells us that f satisfies f /x =
i f /y, which, reassuringly, agrees with the first equation in (15.7).
Example 15.4. Consider the elementary function

    z³ = (x³ − 3 x y²) + i (3 x² y − y³).

Its real part u = x³ − 3 x y² and imaginary part v = 3 x² y − y³ satisfy the Cauchy–Riemann
equations (15.22), namely

    ∂u/∂x = 3 x² − 3 y² = ∂v/∂y,        ∂u/∂y = − 6 x y = − ∂v/∂x.

This proves that f(z) = z³ is complex differentiable. Not surprisingly, its derivative turns
out to be

    f′(z) = 3 z² = (3 x² − 3 y²) + i (6 x y) = ∂u/∂x + i ∂v/∂x = ∂v/∂y − i ∂u/∂y.
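Such verifications can also be automated. The following SymPy sketch (illustrative, not from the text) checks the Cauchy–Riemann equations (15.22) for f(z) = z³ and recovers the derivative via (15.23):

    import sympy as sp

    x, y = sp.symbols('x y', real=True)
    f = (x + sp.I * y) ** 3
    u, v = sp.re(sp.expand(f)), sp.im(sp.expand(f))
    assert sp.simplify(sp.diff(u, x) - sp.diff(v, y)) == 0      # u_x = v_y
    assert sp.simplify(sp.diff(u, y) + sp.diff(v, x)) == 0      # u_y = -v_x
    fprime = sp.diff(u, x) + sp.I * sp.diff(v, x)               # f'(z) = u_x + i v_x
    assert sp.expand(fprime - 3 * (x + sp.I * y) ** 2) == 0     # equals 3 z^2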

Fortunately, the complex derivative obeys all of the usual rules that you learned in
real-variable calculus. For example,

    d/dz zⁿ = n z^{n−1},        d/dz e^{c z} = c e^{c z},        d/dz log z = 1/z,        (15.24)

and so on. The power n can even be non-integral or, in view of the identity zⁿ = e^{n log z},
complex, while c is any complex constant. The exponential formulae (15.17) for the complex trigonometric functions imply that they also satisfy the standard rules

    d/dz cos z = − sin z,        d/dz sin z = cos z.                        (15.25)

The formulae for differentiating sums, products, ratios, inverses, and compositions of complex functions are all the same as their real counterparts. Thus, thankfully, you don't need
to learn any new rules for performing complex differentiation!
There are many examples of quite reasonable functions which do not have a complex
derivative. The simplest is the complex conjugate function

    f(z) = z̄ = x − i y.

Its real and imaginary parts do not satisfy the Cauchy–Riemann equations, and hence z̄
does not have a complex derivative. More generally, any function f(x, y) = h(z, z̄) that
explicitly depends on the complex conjugate variable z̄ is not complex-differentiable.
Power Series and Analyticity
The most remarkable feature of complex analysis, which completely distinguishes it
from real function theory, is that the existence of one complex derivative automatically
implies the existence of infinitely many! All complex functions f (z) are infinitely differentiable and, in fact, analytic where defined. The reason for this surprising and profound
fact will, however, not become evident until we learn the basics of complex integration in
Section 15.5. In this section, we shall take analyticity as a given, and investigate some of
its principal consequences.
Definition 15.5. A complex function f(z) is called analytic at a point z_0 if it has a
power series expansion

    f(z) = a_0 + a_1 (z − z_0) + a_2 (z − z_0)² + a_3 (z − z_0)³ + ⋯ = Σ_{n=0}^∞ a_n (z − z_0)ⁿ,        (15.26)

which converges for all z sufficiently close to z_0.


Typically, the standard ratio or root tests for convergence of (real) series that you
learned in ordinary calculus, [9, 114], can be applied to determine where a given (complex)
power series converges. We note that if f(z) and g(z) are analytic at a point z_0, so is their
sum f(z) + g(z), product f(z) g(z) and, provided g(z_0) ≠ 0, ratio f(z)/g(z).
Example 15.6. All of the real power series from elementary calculus carry over to the complex versions of the standard functions. For example,
e^z = 1 + z + z²/2 + z³/6 + ⋯ = ∑_{n=0}^{∞} zⁿ/n!   (15.27)
is the Taylor series for the exponential function based at z0 = 0. A simple application
of the ratio test proves that the series converges for all z. On the other hand, the power
series
1/(z² + 1) = 1 − z² + z⁴ − z⁶ + ⋯ = ∑_{k=0}^{∞} (−1)ᵏ z²ᵏ,   (15.28)
converges inside the unit disk, where |z| < 1, and diverges outside, where |z| > 1. Again, convergence is established through the ratio test. The ratio test is inconclusive when |z| = 1, and we shall leave the much harder question of precisely where on the unit disk this complex series converges to a more advanced text, e.g., [4].
In general, there are three possible options for the domain of convergence of a complex power series (15.26):
(a) The series converges for all z.
(b) The series converges inside a disk |z − z₀| < ρ of radius ρ > 0 centered at z₀ and diverges for all |z − z₀| > ρ outside the disk. The series may converge at some (but not all) of the points on the boundary of the disk where |z − z₀| = ρ.
(c) The series only converges, trivially, at z = z₀.
The number ρ is known as the radius of convergence of the series. In case (a), we say ρ = ∞, while in case (c), ρ = 0, and the series does not represent an analytic function. An example with ρ = 0 is the power series ∑ n! zⁿ. In the intermediate case, determining precisely where on the boundary of the convergence disk the power series converges is quite delicate, and will not be pursued here. The proof of this result can be found in Exercise . See [4, 63] for further details.
Remarkably, the radius of convergence for the power series of a known analytic function f(z) can be determined by inspection, without recourse to any fancy convergence tests! Namely, ρ is equal to the distance from z₀ to the nearest singularity of f(z), meaning a point where the function fails to be analytic. This explains why the Taylor series of e^z converges everywhere, while that of (z² + 1)⁻¹ only converges inside the unit disk. Indeed, e^z is analytic for all z and has no singularities; therefore the radius of convergence of its power series centered at any point z₀ is equal to ρ = ∞. On the other hand, the function
f(z) = 1/(z² + 1) = 1/((z + i)(z − i))
has singularities (poles) at z = ± i, and so the series (15.28) has radius of convergence ρ = 1, which is the distance from z₀ = 0 to the singularities. Therefore, the extension of the theory of power series to the complex plane serves to explain the apparent mystery of why, as a real function, (1 + x²)⁻¹ is well-defined and analytic for all real x, but its power series only converges on the interval (−1, 1). It is the complex singularities that prevent its convergence when |x| > 1! If we expand (z² + 1)⁻¹ in a power series at some other point, say z₀ = 1 + 2 i, then we need to determine which singularity is closest. We compute |i − z₀| = |−1 − i| = √2, while |−i − z₀| = |−1 − 3 i| = √10, and so ρ = √2 is the smaller of these two numbers. Thus we can determine the radius of convergence without any explicit formula for its (rather complicated) Taylor expansion at z₀ = 1 + 2 i.
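This rule of thumb can be tested numerically. The sketch below (sympy; an ad hoc script written for this discussion, not a library routine) computes the Taylor coefficients of (z² + 1)⁻¹ at z₀ = 1 + 2 i and watches the ratios |aₙ₋₁/aₙ|, which should approach the distance √2 to the nearest pole.

    import sympy as sp

    z = sp.symbols('z')
    z0 = 1 + 2*sp.I
    f = 1 / (z**2 + 1)

    # Taylor coefficients a_n = f^{(n)}(z0)/n! for n = 0, ..., N
    N = 20
    a = [complex((sp.diff(f, z, n).subs(z, z0) / sp.factorial(n)).evalf())
         for n in range(N + 1)]

    # The ratio test: |a_{n-1}/a_n| tends to the radius of convergence rho
    for n in (5, 10, 20):
        print(n, abs(a[n - 1] / a[n]))

    print("distance to nearest pole:", abs(complex(sp.I - z0)))   # sqrt(2) = 1.4142...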
There are, in fact, only three possible types of singularities of a complex function f(z):
(i) Pole. A singular point z = z₀ is called a pole of order n > 0 if and only if the function
h(z) = (z − z₀)ⁿ f(z)   (15.29)
is analytic and nonzero, h(z₀) ≠ 0, at z = z₀. The simplest example of such a function is f(z) = a (z − z₀)⁻ⁿ for a ≠ 0 a complex constant.
(ii) Branch point. We have already encountered the two basic types: algebraic branch points, such as the function ⁿ√z at z₀ = 0, and logarithmic branch points such as log z at z₀ = 0. The degree of the branch point is n in the first case and ∞ in the second.
(iii) Essential singularity. By definition, a singularity is essential if it is not a pole or a branch point. The simplest example is the essential singularity at z₀ = 0 of the function e^{1/z}. Details are left as an Exercise .

Example 15.7. For example, the function
f(z) = e^z / (z³ − z² − 5z − 3)
has a simple (order 1) pole at z = 3 and a double (order 2) pole at z = −1. Indeed, factorizing the denominator z³ − z² − 5z − 3 = (z + 1)²(z − 3), we see that the functions
h₁(z) = (z − 3) f(z) = e^z/(z + 1)²,   h₂(z) = (z + 1)² f(z) = e^z/(z − 3),
are analytic and non-zero at, respectively, z = 3 and z = −1.

A complex function can have a variety of singularities. For example, the function
f(z) = ∛(z + 2) e^{1/z} / (z² + 1)
has simple poles at z = ± i, a branch point of degree 3 at z = −2 and an essential singularity at z = 0.
Differentiation
As in the real case, and unlike Fourier series, convergent power series can always be
repeatedly term-wise differentiated. Therefore, given the convergent series (15.26), we have
the corresponding series
f′(z) = a₁ + 2a₂(z − z₀) + 3a₃(z − z₀)² + 4a₄(z − z₀)³ + ⋯ = ∑_{n=0}^{∞} (n + 1) aₙ₊₁ (z − z₀)ⁿ,
f″(z) = 2a₂ + 6a₃(z − z₀) + 12a₄(z − z₀)² + 20a₅(z − z₀)³ + ⋯ = ∑_{n=0}^{∞} (n + 1)(n + 2) aₙ₊₂ (z − z₀)ⁿ,   (15.30)

and so on, for its derivatives. The proof that the differentiated series have the same radius
of convergence can be found in [4, 101]. As a consequence, we deduce the following
important result.
Theorem 15.8. Any analytic function is infinitely differentiable.


In particular, when we substitute z = z₀ into the successively differentiated series, we discover that
a₀ = f(z₀),   a₁ = f′(z₀),   a₂ = ½ f″(z₀),
and, in general,
aₙ = f⁽ⁿ⁾(z₀)/n!.   (15.31)
Therefore, a convergent power series (15.26) is, inevitably, the usual Taylor series
f(z) = ∑_{n=0}^{∞} f⁽ⁿ⁾(z₀)/n! (z − z₀)ⁿ,   (15.32)
for the function f(z) at the point z₀.


Let us conclude this section by summarizing the fundamental theorem that characterizes complex functions. A complete, rigorous proof relies on complex integration theory,
which is the topic of Section 15.5.
Theorem 15.9. Let Ω ⊂ C be an open set. The following properties are equivalent:
(a) The function f(z) has a continuous complex derivative f′(z) for all z ∈ Ω.
(b) The real and imaginary parts of f(z) have continuous partial derivatives and satisfy the Cauchy–Riemann equations (15.22) in Ω.
(c) The function f(z) is analytic for all z ∈ Ω, and so is infinitely differentiable and has a convergent power series expansion at each point z₀ ∈ Ω. The radius of convergence ρ is at least as large as the distance from z₀ to the boundary ∂Ω. See Figure rc .
Any function that satisfies the conditions of Theorem 15.9 will be referred to as a complex function. Sometimes one of the equivalent adjectives analytic or holomorphic is added for emphasis. From now on, all complex functions are assumed to be analytic everywhere on their domain of definition, except, possibly, at certain isolated singularities.

15.3. Harmonic Functions.


We began this chapter by motivating the analysis of complex functions through applications to the solution of the two-dimensional Laplace equation. Let us now formalize
the precise relationship between the two.
Theorem 15.10. If f (z) = u(x, y) + i v(x, y) is any complex analytic function, then
its real and imaginary parts, u(x, y), v(x, y), are both harmonic functions.
Proof: Differentiating the Cauchy–Riemann equations (15.22), and invoking the equality of mixed partial derivatives, we find that
∂²u/∂x² = ∂/∂x (∂u/∂x) = ∂/∂x (∂v/∂y) = ∂²v/∂x∂y = ∂/∂y (∂v/∂x) = ∂/∂y (− ∂u/∂y) = − ∂²u/∂y².
Theorem 15.9 allows us to differentiate u and v as often as desired.

3/7/03

653

c 2003

Peter J. Olver

Therefore, u is a solution to the Laplace equation uxx + uyy = 0. The proof for v is
similar.
Q.E.D.
Thus, every complex function f = u+ i v gives rise to two harmonic functions. It is, of
course, of interest to know whether we can invert this procedure. Given a harmonic function
u(x, y), does there exist a harmonic function v(x, y) such that f = u + i v is a complex
analytic function? If so, the harmonic function v(x, y) is known as a harmonic conjugate
to u. The harmonic conjugate is found by solving the Cauchy–Riemann equations
∂v/∂x = − ∂u/∂y,   ∂v/∂y = ∂u/∂x,   (15.33)
which, for a prescribed function u(x, y), constitutes an inhomogeneous linear system of
partial differential equations for v(x, y). As such, it is usually not hard to solve, as the
following example illustrates.
Example 15.11. As the reader can verify, the harmonic polynomial
u(x, y) = x³ − 3x²y − 3xy² + y³
satisfies the Laplace equation everywhere. To find a harmonic conjugate, we solve the Cauchy–Riemann equations (15.33). First of all,
∂v/∂x = − ∂u/∂y = 3x² + 6xy − 3y²,
and hence, by direct integration with respect to x,
v(x, y) = x³ + 3x²y − 3xy² + h(y),
where h(y), the constant of integration, is a function of y alone. To determine h we substitute our formula into the second Cauchy–Riemann equation:
3x² − 6xy + h′(y) = ∂v/∂y = ∂u/∂x = 3x² − 6xy − 3y².
Therefore, h′(y) = − 3y², and so h(y) = − y³ + c, where c is a real constant. We conclude that every harmonic conjugate to u(x, y) has the form
v(x, y) = x³ + 3x²y − 3xy² − y³ + c.
Note that the corresponding complex function
u(x, y) + i v(x, y) = (x³ − 3x²y − 3xy² + y³) + i (x³ + 3x²y − 3xy² − y³ + c) = (1 + i) z³ + i c
is a particular complex cubic polynomial.
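The same computation can be carried out symbolically. Here is a small sympy sketch (written for this example; the variable and function names are ours) that integrates the Cauchy–Riemann system (15.33) for the polynomial above and recovers the harmonic conjugate found by hand.

    import sympy as sp

    x, y, c = sp.symbols('x y c', real=True)
    u = x**3 - 3*x**2*y - 3*x*y**2 + y**3

    # First equation of (15.33): v_x = -u_y, integrated with respect to x
    v = sp.integrate(-sp.diff(u, y), x)
    # Second equation: v_y = u_x determines the leftover function h(y)
    h_prime = sp.diff(u, x) - sp.diff(v, y)
    v = v + sp.integrate(h_prime, y) + c

    print(sp.expand(v))                                   # x**3 + 3*x**2*y - 3*x*y**2 - y**3 + c
    print(sp.simplify(sp.diff(v, x) + sp.diff(u, y)))     # 0: first CR equation holds
    print(sp.simplify(sp.diff(v, y) - sp.diff(u, x)))     # 0: second CR equation holds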


Remark: On a connected domain, all harmonic conjugates to a given function u(x, y) only differ by a constant: ṽ(x, y) = v(x, y) + c; see Exercise .
Although most harmonic functions have harmonic conjugates, unfortunately this is


not always the case. Interestingly, the existence or non-existence of a harmonic conjugate
can depend on the underlying geometry of the domain of definition of the function. If
the domain is simply-connected, and so contains no holes, then one can always find a
harmonic conjugate. In fact, this is an immediate consequence of our characterization of
potential functions in Chapter A. Otherwise, if the domain of definition of our harmonic
function u(x, y) is not simply-connected, then there may not exist a single-valued harmonic
conjugate v(x, y) to serve as the imaginary part of a complex function f (z).
Example 15.12. The simplest example where the latter possibility occurs is the logarithmic potential
u(x, y) = log r = ½ log(x² + y²).
This function is harmonic on the non-simply-connected domain Ω = C \ {0}, but it is not the real part of any single-valued complex function. Indeed, according to (15.18), the logarithmic potential is the real part of the multiply-valued complex logarithm log z, and so its harmonic conjugate is ph z = θ, which cannot be consistently and continuously defined on all of Ω. On the other hand, it is possible to choose a continuous, single-valued branch of the angle θ = ph z if z is restricted to a simply connected subdomain Ω̃, and so log r does have a genuine harmonic conjugate on Ω̃.
The harmonic function
u(x, y) = x/(x² + y²)
is also defined on the same non-simply-connected domain Ω = C \ {0}, with a singularity at x = y = 0. In this case, there is a single-valued harmonic conjugate, namely
v(x, y) = − y/(x² + y²),
which is defined on all of Ω. Indeed, according to (15.12), these functions define the real and imaginary parts of the complex function u + i v = 1/z. Alternatively, one can directly check that they satisfy the Cauchy–Riemann equations (15.22).
Remark: On the punctured plane Ω = C \ {0}, the logarithmic potential is, in a sense, the only counterexample that prevents a harmonic conjugate from being constructed. It can be shown, [XC], that if u(x, y) is a harmonic function defined on a punctured disk Ω_R = { 0 < |z| < R }, where 0 < R ≤ ∞, then there exists a constant c such that ũ(x, y) = u(x, y) − c log r is also harmonic and possesses a single-valued harmonic conjugate ṽ(x, y). As a result, the function f̃ = ũ + i ṽ is analytic on all of Ω_R, and so our original function u(x, y) is the real part of the multiply-valued analytic function f(z) = f̃(z) + c log z. We shall use this fact in our later analysis of airfoils.

Theorem 15.13. Every harmonic function u(x, y) defined on a simply-connected domain Ω is the real part of a complex-valued function f(z) = u(x, y) + i v(x, y) which is defined for all z = x + i y ∈ Ω.

We can, by a previous remark, add in any constant to the harmonic conjugate, but this does
not affect the subsequent argument.

Proof: We first rewrite the Cauchy–Riemann equations (15.33) in vectorial form as an equation for the gradient of v:
∇v = ∇⊥u,   where   ∇⊥u = ( − ∂u/∂y, ∂u/∂x )ᵀ   (15.34)
is the vector field that is everywhere orthogonal to the gradient of u and of the same length:
∇⊥u · ∇u = 0,   ‖ ∇⊥u ‖ = ‖ ∇u ‖.
Thus, we have established the important observation that the gradient of a harmonic function and that of its harmonic conjugate are mutually orthogonal vector fields:
∇v · ∇u ≡ 0.   (15.35)
Now, according to Theorem A.8, provided we work on a simply-connected domain, the gradient equation
∇v = f = ( f₁, f₂ )ᵀ
has a solution if and only if the vector field f satisfies the curl-free constraint
∇ × f = ∂f₂/∂x − ∂f₁/∂y ≡ 0.
In our specific case, the curl of the perpendicular vector field ∇⊥u coincides with the divergence of ∇u itself, which, in turn, coincides with the Laplacian:
∇ × ∇⊥u = ∇ · ∇u = Δu = 0,   i.e.,   ∂/∂x (∂u/∂x) + ∂/∂y (∂u/∂y) = ∂²u/∂x² + ∂²u/∂y² = 0.
The result is zero because we are assuming that u is harmonic. Equation (21.19) permits us to reconstruct the harmonic conjugate v(x, y) from its gradient ∇v through line integration:
v(x, y) = ∫_C ∇v · dx = ∫_C ∇⊥u · dx = ∫_C ∇u · n ds,   (15.36)
where C is any curve connecting a fixed point (x₀, y₀) to (x, y). Therefore, the harmonic conjugate to a given potential function u can be obtained by evaluating its (path-independent) flux integral (15.36).   Q.E.D.
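Formula (15.36) can be checked directly. The following numpy sketch (a throwaway script for this proof, with our own function names) evaluates the line integral of ∇⊥u along the straight segment from (0, 0) to (x, y) for the polynomial of Example 15.11, and compares it with the closed-form conjugate (taking c = 0 so that v(0, 0) = 0).

    import numpy as np

    def u_x(x, y):
        return 3*x**2 - 6*x*y - 3*y**2

    def u_y(x, y):
        return -3*x**2 - 6*x*y + 3*y**2

    def conjugate_by_line_integral(X, Y, n=2001):
        """Integrate grad v = (-u_y, u_x) along the segment t -> (t*X, t*Y), 0 <= t <= 1."""
        t = np.linspace(0.0, 1.0, n)
        integrand = -u_y(t*X, t*Y) * X + u_x(t*X, t*Y) * Y
        return np.trapz(integrand, t)

    X, Y = 0.7, -1.2
    print(conjugate_by_line_integral(X, Y))
    print(X**3 + 3*X**2*Y - 3*X*Y**2 - Y**3)    # closed-form v(X, Y) from Example 15.11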
Remark: As a consequence of (15.23) and the Cauchy–Riemann equations (15.33),
f′(z) = ∂u/∂x − i ∂u/∂y = ∂v/∂y + i ∂v/∂x.   (15.37)
Thus, the components of the gradients ∇u and ∇v appear as the real and imaginary parts of the complex derivative f′(z).
Since we are working in R², these properties along with the right hand rule serve to uniquely characterize ∇⊥u.

Figure 15.6. Level Curves of the Real and Imaginary Parts of z² and z³.
The orthogonality (15.34) of the gradient of a function and of its harmonic conjugate has the following important geometric consequence. Recall, Theorem A.14, that the gradient ∇u of a function points in the normal direction to its level curves { u(x, y) = c }. Since ∇v is orthogonal to ∇u, this must mean that ∇v is tangent to the level curves of u. Vice versa, ∇v is normal to its level curves, and so ∇u is tangent to the level curves of its harmonic conjugate v. Since their tangent directions ∇u and ∇v are orthogonal, the level curves of the real and imaginary parts of a complex function form a mutually orthogonal system of plane curves, but with one key exception. If we are at a critical point, where ∇u = 0, then ∇v = ∇⊥u = 0, and the vectors do not define tangent directions. Therefore, the orthogonality of the level curves does not necessarily hold at critical points. It is worth pointing out that, in view of (15.37), the critical points of u are the same as those of v and also the same as the critical points of the corresponding complex function f(z), i.e., where its complex derivative vanishes: f′(z) = 0.
In Figure 15.6, we illustrate the preceding discussion by plotting the level curves of
the real and imaginary parts of the monomials z 2 and z 3 . Note that, except at the origin,
where the derivative vanishes, the level curves intersect everywhere at right angles.
Applications to Fluid Mechanics
Consider a planar steady state fluid flow, with velocity vector field
v(x) = ( u(x, y), v(x, y) )ᵀ   at the point x = (x, y) ∈ Ω.
Here Ω ⊂ R² is the domain occupied by the fluid, while the vector v(x) represents the instantaneous velocity of the fluid at the point x. In many physical situations, the flow of

See the remarks in Chapter A on the interpretation of a planar fluid flow as the cross-section
of a fully three-dimensional fluid motion that does not depend upon the vertical coordinate.

liquids (and, although less often, gases) is both incompressible and irrotational, which, for short, are known as ideal fluid flows. Recall that the flow is incompressible if and only if it has vanishing divergence:
∇ · v = ∂u/∂x + ∂v/∂y = 0.   (15.38)
On the other hand, the flow is irrotational if and only if it has vanishing curl:
∇ × v = ∂v/∂x − ∂u/∂y = 0.   (15.39)

The two constraints (15.38), (15.39) are almost identical to the Cauchy–Riemann equations (15.22)! The only difference is the sign in front of the derivatives of v, but this can be easily remedied by replacing v by its negative − v. As a result, we deduce the connection
between ideal planar fluid flows and complex functions.
Theorem 15.14. The vector field v = ( u(x, y), v(x, y) )ᵀ is the velocity vector of an ideal fluid flow if and only if
f(z) = u(x, y) − i v(x, y)   (15.40)
is a complex analytic function of z = x + i y.


Therefore, the components u(x, y) and − v(x, y) of the velocity vector field for an ideal fluid are harmonic conjugates. The complex function (15.40) is known as the complex velocity of the fluid flow. When using this result, do not forget the minus sign that appears in front of the imaginary part of f(z).
As in Example A.7, the fluid particles will follow the curves z(t) = x(t) + i y(t) obtained by integrating the differential equations
dx/dt = u(x, y),   dy/dt = v(x, y),   (15.41)
which, in view of (15.40), we can rewrite in complex form
dz/dt = \overline{f(z)}.   (15.42)
Each fluid particle's motion z(t) is uniquely prescribed by its initial position z(0) = z₀ = x₀ + i y₀ at time t = 0. The curves parametrized by z(t) are the paths followed by the particles, i.e., the streamlines of the flow. In particular, if the complex velocity vanishes, f(z₀) = 0, then the solution z(t) ≡ z₀ to (15.42) is constant, and hence z₀ is a stagnation point of the flow.
Example 15.15. The simplest example is when the velocity is constant, corresponding to a uniform steady flow. Consider first the case
f(z) = 1,
See below for more details on complex curves.


which corresponds to the horizontal velocity vector field v = ( 1, 0 )ᵀ. The actual fluid flow is found by integrating the system
ż = 1,   or   ẋ = 1,   ẏ = 0.
Thus, the solution z(t) = t + z₀ represents a uniform horizontal fluid motion whose streamlines are straight lines parallel to the real axis; see Figure flows .
Consider next a more general constant velocity
f(z) = c = a + i b.
The fluid particles will solve the ordinary differential equation
ż = c̄ = a − i b,   so that   z(t) = c̄ t + z₀.
The streamlines remain parallel straight lines, but now at an angle φ = ph c̄ = − ph c with the horizontal; see Figure laminar . The fluid particles move along the streamlines at constant speed | c̄ | = | c |.
The next simplest complex velocity function is
f(z) = z = x + i y.   (15.43)
The corresponding fluid flow is found by integrating the system
ż = z̄,   or, in real form,   ẋ = x,   ẏ = − y.
The origin x = y = 0 is a stagnation point. The trajectories of the nonstationary solutions
z(t) = x₀ eᵗ + i y₀ e⁻ᵗ   (15.44)
are the hyperbolas x y = c and the positive and negative semi-axes, as illustrated in Figure flows .
On the other hand, if we choose
f(z) = − i z = y − i x,
then the flow is the solution to
ż = i z̄,   or, in real form,   ẋ = y,   ẏ = x.
The solutions
z(t) = (x₀ cosh t + y₀ sinh t) + i (x₀ sinh t + y₀ cosh t)
move along the hyperbolas (and rays) x² − y² = c². Thus, this flow is obtained by rotating the preceding example by 45°.
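Streamlines such as these can also be traced by integrating (15.42) numerically. A minimal Runge-Kutta sketch (numpy; all names and the step size are our own choices) for the complex velocity f(z) = z confirms that the product x y stays constant along each trajectory, as the explicit solution (15.44) predicts.

    import numpy as np

    def f(z):                      # complex velocity (15.43)
        return z

    def streamline(z0, dt=0.01, steps=300):
        """Integrate dz/dt = conj(f(z)) by the classical fourth-order Runge-Kutta method."""
        z, path = z0, [z0]
        for _ in range(steps):
            k1 = np.conj(f(z))
            k2 = np.conj(f(z + 0.5*dt*k1))
            k3 = np.conj(f(z + 0.5*dt*k2))
            k4 = np.conj(f(z + dt*k3))
            z = z + dt*(k1 + 2*k2 + 2*k3 + k4)/6
            path.append(z)
        return np.array(path)

    path = streamline(1.0 + 1.0j)
    print(path.real * path.imag)   # stays ~1.0: the particle moves along the hyperbola x y = 1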
Example 15.16. A solid object in a fluid flow is characterized by the no-flux condition that the fluid velocity v is everywhere tangent to the boundary, and hence no fluid flows into or out of the object. As a result, the boundary will consist of streamlines and stagnation points of the idealized fluid flow. For example, the boundary of the upper right quadrant Q = { x > 0, y > 0 } ⊂ C consists of the positive x and y axes (along with the origin). Since these are streamlines of the flow with complex velocity (15.43), its restriction to Q represents the flow past a 90° interior corner, which appears in Figure corner . The
fluid particles move along hyperbolas as they flow past the corner.
Remark: We could also restrict this flow to the domain Ω = C \ { x < 0, y < 0 } consisting of three quadrants, and corresponding to a 90° exterior corner. However, the restricted flow is not as relevant in this case since it does not have a physically realizable asymptotic behavior at large distances. See Exercise  for the correct physical flow around an exterior corner.
Now, suppose that the complex velocity f(z) admits a complex anti-derivative, i.e., a complex analytic function
χ(z) = φ(x, y) + i ψ(x, y)   that satisfies   dχ/dz = f(z).   (15.45)
Using the formula (15.23) for the complex derivative, we see that
dχ/dz = ∂φ/∂x − i ∂φ/∂y = u − i v,   so   ∂φ/∂x = u,   ∂φ/∂y = v.
Thus, ∇φ = v, and hence the real part φ(x, y) of the complex function χ(z) defines a velocity potential for the fluid flow. For this reason, the anti-derivative (15.45) is known as the complex potential function for the given fluid velocity field.
Since the complex potential is analytic, its real part, the potential function, is harmonic and therefore satisfies the Laplace equation Δφ = 0. Conversely, any harmonic function can be viewed as the potential function for some fluid flow. The real fluid velocity is its gradient v = ∇φ. The harmonic conjugate ψ(x, y) to the velocity potential also plays an important role, and, in fluid mechanics, is known as the stream function for the fluid flow. It also satisfies the Laplace equation Δψ = 0, and the potential and stream function are related by the Cauchy–Riemann equations (15.22).
The level curves of the velocity potential, φ(x, y) = c, are known as equipotential curves for the flow. The velocity vector v = ∇φ points in the normal direction to the equipotentials. On the other hand, as we noted above, v = ∇φ is tangent to the level curves ψ(x, y) = d of its harmonic conjugate stream function. But v is the velocity field, and so tangent to the streamlines followed by the fluid particles. Thus, these two systems of curves must coincide, and we infer that the level curves of the stream function are the streamlines of the flow, whence the name stream function! Summarizing, for an ideal fluid flow, the equipotentials { φ = c } and streamlines { ψ = d } form mutually orthogonal systems of plane curves. The fluid velocity v = ∇φ is tangent to the streamlines and normal to the equipotentials, whereas the gradient of the stream function, ∇ψ, is tangent to the equipotentials and normal to the streamlines.
The discussion in the preceding paragraph implicitly relied on the fact that the velocity is nonzero, v = ∇φ ≠ 0, which means we are not at a stagnation point, where the fluid is not moving. While streamlines and equipotentials might begin or end at a stagnation point, there is no guarantee, and, indeed, in general it is not the case that they meet at mutually orthogonal directions there.
Example 15.17. The simplest example of a complex potential function is
χ(z) = z = x + i y.
Figure 15.7. Equipotentials and Streamlines for z + 1/z.

Thus, the velocity potential is φ(x, y) = x, while its harmonic conjugate stream function is ψ(x, y) = y. The complex derivative of the potential is the complex velocity,
f(z) = dχ/dz = 1,
which corresponds to the uniform horizontal fluid motion considered first in Example 15.15. Note that the horizontal streamlines coincide with the level sets y = k of the stream function, whereas the equipotentials φ = x = c are the orthogonal system of vertical lines.
Next, consider the complex potential function
χ(z) = ½ z² = ½ (x² − y²) + i x y.
The complex velocity function
f(z) = χ′(z) = z = x + i y
leads to the hyperbolic flow (15.44). The hyperbolic streamlines x y = d are the level curves of the stream function ψ(x, y) = x y. The equipotential lines ½ (x² − y²) = c form a system of orthogonal hyperbolas. A picture of the equipotentials and streamlines in this particular case can be found in the first plot in Figure 15.6.
Example 15.18. Flow Around a Disk. Consider the complex potential function
χ(z) = z + 1/z = ( x + x/(x² + y²) ) + i ( y − y/(x² + y²) ).   (15.46)
The corresponding complex fluid velocity is
f(z) = dχ/dz = 1 − 1/z² = 1 − (x² − y²)/(x² + y²)² + i 2xy/(x² + y²)².   (15.47)
The equipotential curves and streamlines are plotted in Figure 15.7. The points z = ± 1 are stagnation points of the flow, while z = 0 is a singularity. In particular, fluid particles that move along the positive x axis approach the leading stagnation point z = 1, but take an infinite amount of time to reach it. Note that at large distances, the streamlines
ψ(x, y) = y − y/(x² + y²) = d
are asymptotically horizontal, and hence, far away from the origin, the flow is indistinguishable from uniform horizontal motion with complex velocity f(z) ≈ 1. The level curve
for the particular value d = 0 consists of the unit circle | z | = 1 and the real axis y = 0.
In particular, the unit circle | z | = 1 consists of two streamlines and the two stagnation points. Therefore, the flow velocity vector field v = ∇φ is everywhere tangent to the unit circle, and hence satisfies the no-flux condition on the boundary of the unit disk. Thus, we can interpret (15.47), when restricted to the domain Ω = { | z | > 1 }, as the complex velocity of a uniformly moving fluid around the outside of a solid circular disk of radius 1. In three dimensions, this would correspond to the steady flow of a fluid around a solid cylinder.
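A brief numerical check of these assertions (numpy; a script written only for this example): the stream function ψ = y − y/(x² + y²) vanishes on the unit circle, the complex velocity (15.47) vanishes at z = ± 1, and far from the origin it is nearly 1.

    import numpy as np

    def psi(x, y):                       # stream function of chi(z) = z + 1/z
        return y - y / (x**2 + y**2)

    def velocity(z):                     # complex velocity f(z) = 1 - 1/z**2
        return 1 - 1 / z**2

    theta = np.linspace(0.0, 2*np.pi, 400)
    x, y = np.cos(theta), np.sin(theta)
    print(np.max(np.abs(psi(x, y))))                  # ~1e-16: the unit circle is a streamline

    print(velocity(1.0 + 0j), velocity(-1.0 + 0j))    # both 0: the stagnation points
    print(velocity(50.0 + 30.0j))                     # ~1: nearly uniform flow far away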
In this section, we have focussed on the fluid mechanical roles of a harmonic function and its conjugate. An analogous interpretation applies when φ(x, y) represents an electromagnetic potential function; the level curves of its harmonic conjugate ψ(x, y) are the paths followed by charged particles under the electromotive force field v = ∇φ. Similarly, if φ(x, y) represents the equilibrium temperature distribution in a planar domain, its level lines represent the isotherms, or curves of constant temperature, while the level lines of its harmonic conjugate are the curves of heat flow, whose mutual orthogonality was already noted in Chapter A. Finally, if φ(x, y) represents the height of a deformed membrane, then its level curves are the contour lines of elevation. The level curves of its harmonic conjugate are the curves of steepest descent along the membrane, i.e., the routes followed by, say, water flowing down the membrane.

15.4. Conformal Mapping.


As we now know, complex functions provide an almost inexhaustible source of harmonic functions, i.e., solutions to the Laplace equation. Thus, to solve a boundary value problem for Laplace's equation we merely need to find the right complex function whose real part matches the prescribed boundary conditions. Unfortunately, even for relatively simple domains, this is still not a particularly easy task. The one case where we do have an explicit solution is that of a circular disk, where the Poisson integral formula (14.44) provides a complete solution to the Dirichlet boundary value problem. (See Exercise  for the Neumann and mixed boundary value problems.) However, determining the corresponding integral formula or Green's function for a more complicated domain remains a daunting task, even with the relatively powerful tools of complex analysis at our disposal.
There is, however, a wonderful idea that will go very far towards this general goal. Given that we know how to solve a boundary value problem on one particular domain, the unit disk
D = { ζ = ξ + i η | ξ² + η² < 1 },
perhaps we can make an inspired change of variables that will convert the unsolved boundary value problem on Ω into one that we know how to solve on D. In other words, we seek a pair of functions
ξ = p(x, y),   η = q(x, y),   (15.48)
that maps each point (x, y) ∈ Ω to a point (ξ, η) ∈ D in the unit disk, as illustrated in Figure map . The desired mapping must satisfy fairly stringent requirements.
(a) First of all, it should be one-to-one, and so each point (x, y) ∈ Ω maps to a unique point (ξ, η) = (p(x, y), q(x, y)) ∈ D. Under these conditions, each function U(ξ, η) defined on the unit disk will correspond to a unique function
u(x, y) = U( p(x, y), q(x, y) )   (15.49)
defined on the domain Ω, whose value at the point (x, y) equals the value of U at the image point (ξ, η) = (p(x, y), q(x, y)).
(b) Secondly, both the map (15.48) and its inverse
x = P(ξ, η),   y = Q(ξ, η),   (15.50)
should be sufficiently smooth so as to allow us to take derivatives of the functions u(x, y) and U(ξ, η). The Inverse Function Theorem, cf. [9], requires that the Jacobian determinant
∂(ξ, η)/∂(x, y) = det [ ∂ξ/∂x  ∂ξ/∂y ; ∂η/∂x  ∂η/∂y ] ≠ 0   (15.51)
be everywhere non-zero in the domain Ω. Incidentally, the Jacobian condition is enough to ensure that the map is locally (but not necessarily globally) one-to-one.
(c) Moreover, the map (15.48) should extend continuously to the boundary ∂Ω, mapping it to the boundary of the unit disk ∂D = C = { ξ² + η² = 1 }, which is the unit circle. This will ensure that a boundary value problem for u(x, y) on Ω is mapped to a boundary value problem for U(ξ, η) on D.
(d) Finally, we must ensure that if U(ξ, η) satisfies the Laplace equation
ΔU = U_ξξ + U_ηη = 0   on   D,
then u(x, y) as given by (15.49) will satisfy the Laplace equation
Δu = u_xx + u_yy = 0   on   Ω.
Otherwise, the proposed mapping will be of scant help for solving the boundary value problem under consideration. The latter requirement is, without extra insight, quite hard to ensure.
Example 15.19. The scaling change of variables
ξ = a x,   η = b y   (15.52)
changes the elliptical domain Ω = { a²x² + b²y² < 1 } to the unit disk D = { ξ² + η² < 1 }. However, it is not of much help for solving the Laplace equation on the elliptical domain. Indeed, when we relate a function U(ξ, η) on D to
u(x, y) = U(a x, b y)
on Ω, the partial derivatives are related by
∂²u/∂x² = a² ∂²U/∂ξ²,   ∂²u/∂y² = b² ∂²U/∂η².
If U is harmonic, so that ΔU = U_ξξ + U_ηη = 0, then u(x, y) satisfies the partial differential equation
(1/a²) ∂²u/∂x² + (1/b²) ∂²u/∂y² = 0.   (15.53)
Unless a = b, in which case the domain is a circle and we are performing a simple scaling transformation, the function u(x, y) is not a solution to the Laplace equation on Ω. Be that as it may, this change of variables does provide a means of solving the Dirichlet boundary value problem for the elliptic partial differential equation (15.53) on the elliptical domain Ω.
Analytic Maps
The crucial insight that makes the change of variables idea so effective is that complex analytic functions not only provide harmonic functions as candidate solutions to the
Laplace equation, they also provide a large class of mappings that accomplish the desired
goals. The method rests on the simple fact that the composition of two complex analytic
functions is also complex analytic.
Lemma 15.20. If w = F(ζ) is an analytic function of the complex variable ζ = ξ + i η and ζ = g(z) is an analytic function of the complex variable z = x + i y, then the composition w = f(z) ≡ F∘g(z) = F(g(z)) is an analytic function of z.
Proof: The proof that the composition of two differentiable functions is differentiable is identical to the real variable version, [9, 114], and need not be reproduced here. The derivative of the composition is explicitly given by the usual chain rule:
d/dz F(g(z)) = F′(g(z)) g′(z),   or, in Leibnizian notation,   dw/dz = (dw/dζ)(dζ/dz).   Q.E.D.

We interpret a complex function
ζ = g(z)   or   ξ + i η = p(x, y) + i q(x, y)   (15.54)
as a mapping, as in (15.48), that takes a point z = x + i y belonging to a prescribed domain Ω ⊂ C to a point ζ = ξ + i η ∈ D belonging to the image domain D = g(Ω) ⊂ C. Based on our earlier comments, we will make three important assumptions:
(a) The analytic mapping is one-to-one. In other words, we assume that each point ζ ∈ D comes from a unique point z ∈ Ω, and so the inverse function z = g⁻¹(ζ) is a well-defined map from D back to Ω.
(b) The inverse mapping g⁻¹(ζ) is analytic on all of D. Recall that the derivative of the inverse function is given by
d/dζ g⁻¹(ζ) = 1/g′(z)   at   ζ = g(z).   (15.55)
Of course, to properly define the composition, we need to ensure that the range of the function ζ = g(z) is contained in the domain of the function w = F(ζ).

This formula, which is equally valid for complex functions, implies that the derivative of g(z) must be nonzero everywhere in order that g⁻¹(ζ) be differentiable. This condition,
g′(z) ≠ 0   at every point   z ∈ Ω,   (15.56)
will play a crucial role in the development of the method.
(c) The mapping extends continuously to the boundary ∂Ω and maps it to the boundary ∂D of the image domain.
Before trying to apply these techniques to solve boundary value problems for the
Laplace equation, we consider some of the most important examples of analytic maps.
Example 15.21. The simplest nontrivial analytic maps are the translations
ζ = z + c = (x + a) + i (y + b),   (15.57)
which translate the entire complex plane in the direction given by c = a + i b. These are the complex counterparts of the affine translations (7.69) of a vector space. The effect is to map a disk | z + c | < 1 of radius 1 and center at − c to the unit disk | ζ | < 1.
There are two types of linear analytic transformations. First, we have the scaling map
ζ = ρ z = ρ x + i ρ y,   (15.58)
where ρ ≠ 0 is a fixed nonzero real number. These map the disk | z | < 1/| ρ | to the unit disk | ζ | < 1. Second are the rotations
ζ = e^{iφ} z = (x cos φ − y sin φ) + i (x sin φ + y cos φ)   (15.59)
around the origin by a fixed (real) angle φ. These map the unit disk to itself.
Any non-constant affine transformation
ζ = α z + β,   α ≠ 0,   (15.60)
defines an invertible analytic map on all of C, whose inverse map z = α⁻¹(ζ − β) is also affine. Writing α = ρ e^{iφ} in polar coordinates, we see that the affine map (15.60) can be built up from a translation, a scaling and a rotation. As such, it takes the disk | α z + β | < 1 of radius 1/| α | = 1/ρ and center − β/α to the unit disk | ζ | < 1. As such, none of these maps take us to a radically new class of boundary value problems.

Example 15.22. A more interesting complex function is
ζ = g(z) = 1/z,   or   ξ = x/(x² + y²),   η = − y/(x² + y²),   (15.61)
which is known as an inversion of the complex plane. It defines a one-to-one analytic map everywhere except at the origin z = 0; indeed g(z) is its own inverse: g⁻¹(ζ) = 1/ζ. Note that g′(z) = − 1/z² is never zero, and so the derivative condition (15.56) is satisfied

This is slightly different than the real inversion (14.68); see Exercise .

everywhere. Thus, any domain Ω ⊂ C \ {0} will be mapped in a one-to-one manner onto an image domain D = g(Ω) ⊂ C \ {0}.
Note that | ζ | = 1/| z |, while ph ζ = − ph z. Thus, if Ω = { | z | > ρ } denotes the exterior of the circle of radius ρ, then the image points ζ = 1/z satisfy | ζ | = 1/| z | < 1/ρ, and hence the image domain is the punctured disk D = { 0 < | ζ | < 1/ρ }. In particular, the inversion maps the outside of the unit disk to its inside, but with the origin removed, and vice versa. The reader may enjoy seeing what the inversion does to other domains, e.g., the unit square.
Example 15.23. The complex exponential
ζ = g(z) = e^z,   or   ξ = e^x cos y,   η = e^x sin y,   (15.62)
satisfies the condition g′(z) = e^z ≠ 0 everywhere. Nevertheless, it is not one-to-one because e^{z + 2πi} = e^z, and so all points differing by an integer multiple of 2π i are mapped to the same point.
Under the exponential map (15.62), the horizontal line Im z = b is mapped to the curve ζ = e^{x + i b} = e^x (cos b + i sin b), which, as x varies from −∞ to ∞, traces out the ray emanating from the origin that makes an angle ph ζ = b with the real axis. Therefore, the exponential map will map a horizontal strip S_{a,b} = { a < Im z < b } to a wedge-shaped domain Ω_{a,b} = { a < ph ζ < b }, and is one-to-one provided | b − a | < 2π. In particular, the horizontal strip S_{−π/2, π/2} = { − ½π < Im z < ½π } of width π centered around the real axis is mapped, in a one-to-one manner, to the right half plane
R = Ω_{−π/2, π/2} = { − ½π < ph ζ < ½π } = { Re ζ > 0 },
while the horizontal strip S_{−π, π} = { − π < Im z < π } of width 2π is mapped onto the domain
Ω_{−π, π} = { − π < ph ζ < π } = C \ { Im z = 0, Re z ≤ 0 }
obtained by cutting the complex plane along the negative real axis.
On the other hand, vertical lines Re z = a are mapped to circles | ζ | = e^a. Thus, a vertical strip a < Re z < b is mapped to an annulus e^a < | ζ | < e^b, albeit many-to-one, since the strip is effectively wrapped around and around the annulus. The rectangle R = { a < x < b, − π < y < π } of height 2π is mapped in a one-to-one fashion onto an annulus that has been cut along the negative real axis. See Figure ezm for an illustration.
Example 15.24. The squaring map
ζ = g(z) = z²,   or   ξ = x² − y²,   η = 2 x y,   (15.63)
is analytic on all of C, but is not one-to-one. Its inverse is the square root function z = √ζ, which, as discussed in Section 15.1, is double-valued. Furthermore, the derivative g′(z) = 2z vanishes at z = 0, violating the invertibility condition (15.56). However, once we restrict to a simply connected subdomain Ω that does not contain 0, the function g(z) = z² does define a one-to-one mapping, whose inverse z = g⁻¹(ζ) = √ζ is a well-defined, analytic and single-valued branch of the square root function.
The effect of the squaring map on a point z is to square its modulus, | ζ | = | z |², while doubling its angle, ph ζ = ph z² = 2 ph z. Thus, for example, the upper right quadrant
Q = { x > 0, y > 0 } = { 0 < ph z < ½π }
is mapped by (15.63) onto the upper half plane
U = g(Q) = { Im ζ > 0 } = { 0 < ph ζ < π }.
The inverse function maps a point ζ ∈ U back to its unique square root z = √ζ that lies in the quadrant Q. Similarly, a quarter disk
Q_ρ = { 0 < | z | < ρ, 0 < ph z < ½π }
of radius ρ is mapped to a half disk
U_{ρ²} = g(Q_ρ) = { 0 < | ζ | < ρ², Im ζ > 0 }
of radius ρ². On the other hand, the unit square { 0 < x < 1, 0 < y < 1 } is mapped to a certain curvilinear domain, as indicated in Figure z2m .

Example 15.25. A particularly important example is the analytic map
ζ = (z − 1)/(z + 1) = ( x² + y² − 1 + 2 i y ) / ( (x + 1)² + y² ),   (15.64)
where we used (15.14) to derive the formulae for its real and imaginary parts. The map is one-to-one with analytic inverse
z = (1 + ζ)/(1 − ζ) = ( 1 − ξ² − η² + 2 i η ) / ( (1 − ξ)² + η² ),   (15.65)
provided z ≠ −1 and ζ ≠ 1. This particular analytic map has the important property of mapping the right half plane R = { x = Re z > 0 } to the unit disk D = { | ζ |² < 1 }. Indeed, by (15.65),
| ζ |² = ξ² + η² < 1   if and only if   x = ( 1 − ξ² − η² ) / ( (1 − ξ)² + η² ) > 0.
Note that the denominator does not vanish on the interior of the disk.
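A quick numerical confirmation (numpy; an ad hoc script written only for this example) that this map really does send the right half plane into the unit disk, and that (15.65) undoes it:

    import numpy as np

    rng = np.random.default_rng(0)
    z = rng.uniform(0.01, 5.0, 1000) + 1j * rng.uniform(-5.0, 5.0, 1000)   # Re z > 0

    zeta = (z - 1) / (z + 1)
    print(np.max(np.abs(zeta)))          # < 1: every image lies in the unit disk

    z_back = (1 + zeta) / (1 - zeta)     # the inverse map (15.65)
    print(np.max(np.abs(z_back - z)))    # ~1e-15: the round trip recovers z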
The complex functions (15.60), (15.61), (15.64) are particular examples of one of the most important classes of analytic maps. A general linear fractional transformation has the form
ζ = (α z + β)/(γ z + δ),   (15.66)
where α, β, γ, δ are arbitrary complex constants, subject to the restriction
α δ − β γ ≠ 0,
since otherwise (15.66) reduces to a trivial constant (and non-invertible) map.
Example 15.26. The linear fractional transformation
ζ = (z − α)/(ᾱ z − 1),   where   | α | < 1,   (15.67)
maps the unit disk to itself, moving the origin z = 0 to the point ζ = α. To prove this, we note that
| z − α |² = (z − α)(z̄ − ᾱ) = | z |² − α z̄ − ᾱ z + | α |²,
| ᾱ z − 1 |² = (ᾱ z − 1)(α z̄ − 1) = | α |² | z |² − α z̄ − ᾱ z + 1.
Subtracting these two formulae, and using the assumptions that | z | < 1, | α | < 1, we find
| z − α |² − | ᾱ z − 1 |² = ( 1 − | α |² )( | z |² − 1 ) < 0,   so   | z − α | < | ᾱ z − 1 |.
The latter inequality implies that
| ζ | = | z − α | / | ᾱ z − 1 | < 1   provided   | z | < 1,   | α | < 1,
and hence ζ lies within the unit disk.


The rotations (15.59) also map the unit disk to itself, preserving the origin. It can be
proved, [4], that the only invertible analytic mappings that take the unit disk to itself are
obtained by composing such a linear fractional transformation with a rotation.
Proposition 15.27. If ζ = g(z) is a one-to-one analytic map that takes the unit disk to itself, then
g(z) = e^{iφ} (z − α)/(ᾱ z − 1)   (15.68)
for some | α | < 1 and 0 ≤ φ < 2π.

Additional specific properties of linear fractional transformations are outlined in the


exercises. The most important is that they map circles to circles, where, to be completely
accurate, one should view a straight line as a circle of infinite radius. Details can be
found in Exercise .
Conformality
A remarkable geometrical characterization of complex analytic functions is the fact
that, at non-critical points, they preserve angles. The mathematical term for this property
is conformal mapping. Conformality makes sense for any inner product space, although in
practice one usually deals with Euclidean space equipped with the standard dot product.
Definition 15.28. A function g: Rⁿ → Rⁿ is called conformal if it preserves angles.
What does it mean to preserve angles? For the Euclidean norm, the angle between
two vectors is defined by their dot product, as in (3.15). However, most analytic maps are
nonlinear, and so will not map vectors to vectors since they will typically map straight
lines to curves. However, if we interpret angle to mean the angle between two curves,
as illustrated in Figure conf , then we can make sense of the conformality requirement.
Consequently, in order to realize complex functions as conformal maps, we first need to understand what they do to curves.
In general, a curve C in the complex plane is parametrized by a complex-valued function
z(t) = x(t) + i y(t),   a < t < b,   (15.69)
that depends on a real parameter t. Note that there is no essential difference between a complex plane curve (15.69) and a real plane curve (A.1); we have merely switched from vector notation x(t) = ( x(t), y(t) )ᵀ to complex notation z(t) = x(t) + i y(t). All the vectorial curve terminology (closed, simple, piecewise smooth, etc.) we learned in Chapter A is used without any modification here. In particular, the tangent vector to the curve can be identified as the complex number ż(t) = ẋ(t) + i ẏ(t). Smoothness of the curve is guaranteed by the requirement that ż(t) ≠ 0.


Example 15.29.
(a) The curve
z(t) = e^{it} = cos t + i sin t,   for   0 ≤ t ≤ 2π,
parametrizes the unit circle | z | = 1 in the complex plane, which is a simple closed curve. Its complex tangent is ż(t) = i e^{it} = i z(t), which is obtained by rotating z through 90°; see Figure Cc .
(b) The complex curve
z(t) = cosh t + i sinh t = ½(1 + i) eᵗ + ½(1 − i) e⁻ᵗ,   −∞ < t < ∞,
parametrizes the right hand branch of the hyperbola
Re z² = x² − y² = 1.
The complex tangent vector is ż(t) = sinh t + i cosh t = i z̄(t).

In order to better understand the geometry, it will help to rewrite the tangent ż in polar coordinates. We interpret the curve as the motion of a particle in the complex plane, so that z(t) is the position of the particle at time t, and the tangent ż(t) its instantaneous velocity. The modulus of the tangent, | ż | = √(ẋ² + ẏ²), indicates the particle's speed, while its phase, ph ż = tan⁻¹(ẏ/ẋ), measures the direction of motion, or, more precisely, the angle that the curve makes with the horizontal; see Figure Cct .
The angle between two curves is defined as the angle between their tangents at the point of intersection. If the curve C₁ makes an angle θ₁ = ph ż₁(t₁) while the curve C₂ has angle θ₂ = ph ż₂(t₂) at the common point z = z₁(t₁) = z₂(t₂), then the angle θ between the two curves at z is the difference
θ = θ₂ − θ₁ = ph ż₂ − ph ż₁ = ph ( ż₂ / ż₁ ).   (15.70)

Now, suppose we are given an analytic map ζ = g(z). A curve C parametrized by z(t) will be mapped to a curve Γ = g(C) parametrized by the composition ζ(t) = g(z(t)). The tangent to the image curve is related to that of the original curve by the chain rule:
dζ/dt = (dg/dz)(dz/dt),   or   ζ̇(t) = g′(z(t)) ż(t).   (15.71)
Therefore, the effect of the analytic map on the tangent vector ż at the point z ∈ C is to multiply it by the complex number g′(z). If the analytic map satisfies our key assumption g′(z) ≠ 0, then ζ̇ ≠ 0, and so the image curve will remain smooth.
According to equation (15.71),
| ζ̇ | = | g′(z) ż | = | g′(z) | | ż |.   (15.72)
Thus, the speed of motion along the new curve ζ(t) is multiplied by a factor ρ = | g′(z) | > 0. The magnification factor ρ depends only upon the point z and not how the curve passes through it. All curves passing through the point z are speeded up (or slowed down if ρ < 1) by the same factor! Similarly, the angle that the new curve makes with the horizontal is given by
ph ζ̇ = ph ( g′(z) ż ) = ph g′(z) + ph ż,   (15.73)
where we use the fact that the phase of the product of two complex numbers is the sum of their individual phases, (3.72). Therefore, the tangent angle of the curve is increased by an amount φ = ph g′(z). Geometrically, this means that the curve has been rotated through an angle φ, as in Figure aamc . Again, the increase in tangent angle only depends on the point z, and all curves passing through z are rotated by the same amount φ. As a result, the angle between any two curves is preserved. More precisely, if C₁ is at angle θ₁ and C₂ at angle θ₂ at a point of intersection, then their images Γ₁ = g(C₁) and Γ₂ = g(C₂) are at angles ψ₁ = θ₁ + φ and ψ₂ = θ₂ + φ. The angle between the two image curves is the difference
ψ₂ − ψ₁ = (θ₂ + φ) − (θ₁ + φ) = θ₂ − θ₁,
which is the same as the angle between the original curves. This proves the conformality or angle-preservation property of analytic maps.
Theorem 15.30. If ζ = g(z) is an analytic function and g′(z) ≠ 0, then g defines a conformal map.
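The theorem is easy to observe in a computation. The sketch below (numpy; the sample point, directions and names are our own) compares the angle between two curves through z₀ = 1 + i with the angle between their images under g(z) = z², using finite-difference tangents.

    import numpy as np

    def g(z):
        return z**2

    z0 = 1.0 + 1.0j
    d1, d2 = 1.0 + 0j, np.exp(1j*np.pi/3)        # tangent directions of two curves at z0

    def image_tangent(d, h=1e-6):
        """Tangent of the image curve t -> g(z0 + t*d) at t = 0, by central differences."""
        return (g(z0 + h*d) - g(z0 - h*d)) / (2*h)

    print(np.angle(d2 / d1))                                  # pi/3: angle before mapping
    print(np.angle(image_tangent(d2) / image_tangent(d1)))    # pi/3 again: angle preserved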
Remark: The converse is also valid: every planar conformal map comes from a complex analytic function with nonvanishing derivative. A proof is outlined in Exercise .
The conformality of analytic functions is all the more surprising when one reconsiders
elementary examples. In Example 15.24, we discovered that the function w = z 2 maps a
quarter plane to a half plane, and therefore doubles the angle at the origin! Thus g(z) = z 2
is most definitely not conformal at z = 0. The explanation is, of course, that it has zero
derivative at z = 0, and Theorem 15.30 only guarantees conformality when the derivative
is nonzero. Amazingly, the map preserves angles everywhere else! Somehow, the angle
at the origin is doubled, while the angles at all nearby points are preserved. Figure z2c
illustrates this remarkable and counter-intuitive feat.
Example 15.31. A particularly interesting conformal transformation is given by the


function

1
1
=
z+
.
(15.74)
2
z
The Joukowski map arises in the study of flows around airplane wings, since it maps
circles to a variety of airfoil shapes whose aerodynamic properties can be analyzed exactly,
and is named after the pioneering Russian aero- and hydro-dynamics researcher Nikolai
Zhukovskii (Joukowski). Since

1
1
d
=
if and only if
z = 1,
1 2 =0
dz
2
z
the Joukowski map is conformal except at the critical points z = 1, as well as the
singularity z = 0, where it is not defined.
If z = e i lies on the unit circle, then

= 12 e i + e i = cos ,
lies on the real axis, with 1 1. Thus, the Joukowski map squashes the unit circle
down to the real line segment [ 1, 1 ]. The points outside the unit circle fill the rest of the
plane, as do the (nonzero) points inside the unit circle. Indeed, if we solve (15.74) for
p
z = 2 1 ,
(15.75)

we see that every except 1 comes from two different points z; for not on the line
segment [ 1, 1 ] the points inside and outside the unit circle, whereas if 1 < < 1, the
points lie directly above and below it on the circle.Therefore,
(15.74) defines a one-to-one
conformal map from the exterior of the unit circle | z | > 1 onto the exterior of the unit
line segment C \ [ 1, 1 ].
Under the Joukowski map, the concentric circles | z | = r for r 6= 1 are mapped to
ellipses with focii at 1 in the plane, as illustrated in Figure Jtr . The effect on circles
not centered at the origin is quite interesting. The image curves take on a wide variety
of shapes; several examples are plotted in Figure airfoil . If the circle passes through the
singular point z = 1, then its image is no longer smooth, but has a cusp at = 1. Some of
the image curves have the shape of the cross-section through an airplane wing or airfoil .
Later we will see how to apply the Joukowski map to construct the physical fluid flow
around such an airfoil, which proved to be a critical step in early airplane design.
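It is instructive to compute such an image curve explicitly. The following numpy sketch (the particular center is our own arbitrary choice, made only for illustration) pushes a circle through z = 1 through the Joukowski map; the printed points trace out an airfoil-like curve with a cusp at ζ = 1.

    import numpy as np

    def joukowski(z):
        return 0.5 * (z + 1.0/z)

    c = -0.1 + 0.1j                      # center of the circle (chosen off the origin)
    r = abs(1.0 - c)                     # radius chosen so the circle passes through z = 1
    theta = np.linspace(0.0, 2*np.pi, 12, endpoint=False)
    circle = c + r * np.exp(1j*theta)

    for w in joukowski(circle):          # sample points of the image "airfoil"
        print(f"{w.real:+.4f}  {w.imag:+.4f}")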
Composition and The Riemann Mapping Theorem
One of the strengths of the method of conformal mapping is that one can build up lots of complicated examples by simply composing elementary mappings. According to Lemma 15.20, if w = h(z) and ζ = k(w) are analytic functions, their composition ζ = g(z) = k∘h(z) = k(h(z)) is also analytic. If both h and k are one-to-one, so is the composition g = k∘h. Moreover, the composition of two conformal maps is also conformal. Indeed, by the chain rule,
g′(z) = k′(h(z)) h′(z) ≠ 0   provided   k′(h(z)) ≠ 0   and   h′(z) ≠ 0,
and so if h and k satisfy the conformality condition (15.56), so does g = k∘h.


Example 15.32. As we learned in Example 15.23, the exponential function
w = e^z
maps the horizontal strip S = { − ½π < Im z < ½π } conformally onto the right half plane R = { Re w > 0 }. On the other hand, Example 15.25 tells us that the linear fractional transformation
ζ = (w − 1)/(w + 1)
maps the right half plane R conformally to the unit disk D = { | ζ | < 1 }, as in Figure compz . Therefore, the composition
ζ = (e^z − 1)/(e^z + 1)   (15.76)
is a one-to-one conformal map from the horizontal strip S to the unit disk D.

Recall that our motivating goal is to use analytic/conformal maps to transform a boundary value problem for the Laplace equation on a complicated domain to a boundary value problem on the unit disk. Since we already know how to solve the latter, the method effectively constructs a solution to the original problem. Of course, the key question the student should be asking at this point is: Can you construct a conformal map ζ = g(z) from a given domain Ω to the unit disk D = g(Ω)?
The theoretical answer to this question is the celebrated Riemann Mapping Theorem.
Theorem 15.33. If Ω ⊊ C is any simply connected open subset, not equal to the entire complex plane, then there exists a one-to-one analytic function ζ = g(z) that maps Ω to the unit disk D = { | ζ | < 1 }.
Thus, any simply connected open set, including all domains, can be conformally mapped to the unit disk; the one exception is the entire complex plane. (See Exercise  for a reason for this exception.) Note that the domain Ω does not have to be bounded for this result to hold. For example, the conformal map (15.64) takes the unbounded right half plane R = { Re z > 0 } to the unit disk. The proof of this important theorem is not easy and relies on some more advanced results in complex analysis, [4].
The Riemann Mapping Theorem guarantees the existence of a conformal map from any
simply connected domain to the unit disk, but it is an existential result, and gives no clue as
to how to actually construct the desired mapping. And, in general, this is not an easy task.
In practice, one assembles a repertoire of useful conformal maps that apply to particular
domains of interest. One extensive catalog can be found in [Cmap]. More complicated
maps can then be built up by composition of the basic examples. Ultimately, though, the
determination of a suitable conformal map is often more an art than a systematic science.
Let us consider a few additional examples beyond those already encountered:
Example 15.34. Suppose we are asked to conformally map the upper half plane U = { Im z > 0 } to the unit disk D = { | ζ | < 1 }. We already know that the linear fractional transformation
ζ = g(z) = (z − 1)/(z + 1)
maps the right half plane R = { Re z > 0 } to D = g(R). On the other hand, multiplication by i = e^{iπ/2}, with z = h(w) = i w, rotates the complex plane by 90° and so maps the right half plane R to the upper half plane U = h(R). Its inverse h⁻¹(z) = − i z will therefore map U to R = h⁻¹(U). Therefore, to map the upper half plane to the unit disk, we compose these two maps, leading to the conformal map
ζ = g∘h⁻¹(z) = (− i z − 1)/(− i z + 1) = (i z + 1)/(i z − 1)   (15.77)
from U to D.
As a second example, we already know that the squaring map w = z² maps the upper right quadrant Q = { 0 < ph z < ½π } to the upper half plane U. Composing this with our previously constructed map (15.77) leads to the conformal map
ζ = (i z² + 1)/(i z² − 1)   (15.78)
that maps the quadrant Q to the unit disk D.

Example 15.35. The goal of this example is to construct a conformal map that takes a half disk
D⁺ = { | z | < 1, y = Im z > 0 }   (15.79)
to the full unit disk D. The answer is not ζ = z², because the image omits the positive real axis, and so is a disk with a slit cut out of it. The first observation is that the map z = (w − 1)/(w + 1) that we analyzed in Example 15.25 takes the right half plane R = { Re w > 0 } to the unit disk. Moreover, it maps the upper right quadrant Q = { 0 < ph w < ½π } to the half disk (15.79). Its inverse,
w = (1 + z)/(1 − z),
will therefore map the half disk to the upper right quadrant.
On the other hand, we just constructed a conformal map (15.78) that takes the upper right quadrant Q to the unit disk. Therefore, if we compose the two maps (replacing z by w in (15.78)), we obtain the desired conformal map
ζ = ( i w² + 1 )/( i w² − 1 ) = ( i (1 + z)² + (1 − z)² )/( i (1 + z)² − (1 − z)² ) = ( (i + 1)(z² + 1) + 2 (i − 1) z )/( (i − 1)(z² + 1) + 2 (i + 1) z ).
The formula can be further simplified by multiplying numerator and denominator by 1 − i, and so
ζ = − i (z² + 2 i z + 1)/(z² − 2 i z + 1).   (15.80)
The leading factor − i is unimportant and can be omitted, since it merely rotates the disk by 90°.
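As a sanity check on the formula just obtained, the short script below (numpy; written only for this purpose) samples the half disk D⁺ and verifies that every sample is mapped into the unit disk by (15.80):

    import numpy as np

    rng = np.random.default_rng(1)
    r = 0.999 * np.sqrt(rng.uniform(0.0, 1.0, 2000))
    phi = rng.uniform(1e-3, np.pi - 1e-3, 2000)
    z = r * np.exp(1j*phi)               # points of the half disk |z| < 1, Im z > 0

    zeta = -1j * (z**2 + 2j*z + 1) / (z**2 - 2j*z + 1)
    print(np.max(np.abs(zeta)))          # < 1: the half disk is mapped into the unit disk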
Finally, we remark that the conformal map guaranteed by the Riemann Mapping Theorem is not unique. Since the linear fractional transformations (15.67) map the unit disk to itself, we can compose them with any Riemann mapping to produce additional maps from a simply-connected domain to the unit disk. For example, composing (15.67) with (15.76) produces a family of mappings
ζ = ( α (1 + e^z) + 1 − e^z ) / ( ᾱ (1 − e^z) + 1 + e^z ),   (15.81)
which, for any | α | < 1, maps the strip S = { − ½π < Im z < ½π } onto the unit disk.

Annular Domains

The Riemann Mapping Theorem does not apply directly to non-simply connected domains. For purely topological reasons, a hole cannot be made to disappear under a one-to-one continuous mapping, much less a conformal map!
The simplest non-simply connected domain is an annulus consisting of the points between two concentric circles,
A_{r,R} = { r < | ζ | < R },   (15.82)
which, for simplicity, is centered at the origin. It can be proved, [Cmap], that any other domain with a single hole can be mapped to an annulus. The annular radii r, R are not uniquely specified; indeed the linear map ζ = α z maps the annulus (15.82) to a rescaled annulus A_{ρr, ρR} whose inner and outer radii have both been scaled by the factor ρ = | α |. The ratio r/R of the inner to outer radius of the annulus is uniquely specified; annuli with different ratios cannot be mapped to each other by a conformal map. Thus, unlike simply connected domains, there are many standard multiply connected domains.
Example 15.36. Consider the domain
Ω = { | z | < 1 and | z − c | > c }
contained between two nonconcentric circles. To keep the computations simple, we take the outer circle to have radius 1 (which can always be arranged by scaling, anyway) while the inner circle has center at the point z = c on the real axis and radius c, which means that it passes through the origin. We must restrict c < ½ in order that the inner circle not overlap with the outer circle. Our goal is to conformally map this non-concentric annular domain to a concentric annulus of the form
A_{r,1} = { r < | ζ | < 1 }
by a conformal map ζ = g(z); see Figure ann .
Now, according to Example 15.26, a linear fractional transformation of the form
ζ = g(z) = (z − α)/(α z − 1)   with   | α | < 1   (15.83)

If r = 0 or R = ∞, then r/R = 0 by convention.


maps the unit disk to itself. Moreover, as remarked earlier, and demonstrated in Exercise , linear fractional transformations always map circles to circles. Therefore, we seek a particular value of α that maps the inner circle | z − c | = c to a circle of the form | ζ | = r centered at the origin. We choose α real and try to map the points 0 and 2c on the inner circle to the points r and − r on the circle | ζ | = r. This requires
g(0) = α = r,   g(2c) = (2c − α)/(2cα − 1) = − r.   (15.84)
Substituting the first into the second leads to the quadratic equation
c α² − α + c = 0.
There are two real solutions:
α = ( 1 − √(1 − 4c²) )/(2c)   and   α = ( 1 + √(1 − 4c²) )/(2c).   (15.85)
Since 0 < c < ½, the second solution has α > 1, and hence is inadmissible. Therefore, the first solution gives the required conformal map
ζ = ( 2cz − 1 + √(1 − 4c²) ) / ( (1 − √(1 − 4c²)) z − 2c ).
Note in particular that the radius r = α of the inner circle in A_{r,1} is not the same as the radius c of the inner circle in Ω.
For example, taking c = 2/5, we find α = ½, and hence the linear fractional transformation ζ = (2z − 1)/(z − 2) maps the annular domain Ω = { | z | < 1 and | z − 2/5 | > 2/5 } to the concentric annulus A = A_{1/2,1} = { ½ < | ζ | < 1 }. In Figure nonc , we plot the non-concentric circles in Ω that map to concentric circles in the annulus A. In Exercise  the reader is asked to adapt this construction to a general non-concentric annular domain.
Applications to Harmonic Functions and Laplaces Equation
Let us now apply what we have learned about analytic/conformal maps to the study
of harmonic functions and boundary value problems for the Laplace equation. Suppose
= g(z) defines a one-to-one conformal map from the domain z onto the domain
D. In many applications, the target domain D is the unit disk | | < 1, but this is
not necessary for the time being. According to Lemma 15.20, composing the conformal
map g takes analytic functions F () defined on D to analytic functions f (z) = F (g(z))
on , and hence defines a change of variables between their harmonic real and imaginary
parts. In fact, this property does not even require the harmonic function to be the real
part of an analytic function, i.e., we are not required to assume the existence of a harmonic
conjugate.
Proposition 15.37. If U (, ) is a harmonic function of , , and
+ i = p(x, y) + i q(x, y)
3/7/03

675

(15.86)
c 2003

Peter J. Olver

is any analytic mapping, then the composition


u(x, y) = U (p(x, y), q(x, y))

(15.87)

is a harmonic function of x, y.
Proof : This is a straightforward application of the chain rule:
u
U
U
=
+
,
x
x
x
2
2 U
2 U
2u
+
2
=
x2
2 x
x

2
2u
2 U
2 U
+
2
=
y 2
2 y
y

u
U
U
=
+
,
y
y
y
2
U 2
2 U
U

+
+
+
x
2 x
x2

2
U 2
2 U
U
+
+
+
y
2 y
y 2

2
,
x2
2
.
y 2

Using the CauchyRiemann equations

=
,
x
y

=
,
y
x

for the analytic function = + i , we find, after some algebra,


"

2
2 # 2

2U

U
2u 2u
+ 2 =
+
.
+
x2
y
x
x
2
2
Therefore,
0

u = | g (z) | U

where

| g (z) | =

We conclude that whenever U (, ) is any harmonic function, and so a solution to the


Laplace equation U = 0 (in the , variables), then u(x, y) is a solution to the Laplace
equation u = 0 in the x, y variables, and is thus also harmonic.
Q.E.D.
This observation has profound consequences for boundary value problems arising in
physical applications. Suppose we wish to solve the Dirichlet problem
u = 0

in

u=h

on

on a simply connected domain ( C. (The Riemann Mapping Theorem 15.33 tells us


to exclude the case = C. Indeed, this case is devoid of boundary conditions, and
so the problem does not admit a unique solution.) If we can find a complex function
= g(z) = p(x, y)+ i q(x, y) that defines a one-to-one conformal mapping from the domain
to the unit disk D, then we can use the change of variables formula (15.87) to map the
harmonic function u(x, y) on to a harmonic function U (, ) on D. Moreover, the
boundary values of U = H on the unit circle D correspond to those of u = h on by
the same change of variables formula:
h(x, y) = H(p(x, y), q(x, y)),
3/7/03

676

for

(x, y) .
c 2003

(15.88)
Peter J. Olver

We conclude that U (, ) solves the Dirchlet problem


U = 0

in

D,

U =H

on

D.

But we already know how to solve the Dirichlet problem on the unit disk! Namely, the
Poisson integral formula (14.44) gives U (, ). The corresponding solution
to our original

boundary value problem is given by the composition formula u(x, y) = U p(x, y), q(x, y) .
Thus, the solution to the Dirichlet problem on a unit disk can be used to solve the Dirichlet
problem on a more complicated planar domain provided we know the conformal map
whose existence is guaranteed by the Riemann Mapping Theorem 15.33.
Example 15.38. According to Example 15.25, the analytic function
2y
x2 + y 2 1
z1
(15.89)
+i
=
2
2
z+1
(x + 1) + y
(x + 1)2 + y 2

maps the right half plane R = x = Re z > 0 to the unit disk D = | | < 1 . Proposition 15.37 implies that if U (, ) is a harmonic function in the unit disk, then
2

x + y2 1
2y
u(x, y) = U
,
(15.90)
(x + 1)2 + y 2 (x + 1)2 + y 2
+ i = =

is a harmonic function on the right half plane.


To solve the Dirichlet boundary value problem
u = 0,

x > 0,

u(0, y) = h(y),

(15.91)

on the right half plane, we adopt the change of variables (15.89) and use the Poisson
integral formula to construct the solution to the transformed Dirichlet problem
U = 0,

2 + 2 < 1,

U (cos , sin ) = H(),

(15.92)

on the unit disk. The boundary conditions are found as follows. Using the explicit form
x + iy = z =

(1 + )(1 )
1 + | |2
1 2 2 + 2 i
1+
=
=
=
1
| 1 |2
| 1 |2
( 1)2 + 2

for the inverse map, we see that the boundary point the boundary point = + i = e i
on the unit circle D will correspond to the boundary point
2
i
2 i sin

=
=
= i cot
(15.93)
2
2
2
2
( 1) +
1 cot
2
(cos 1) + sin

on the imaginary axis R = Re z = 0 . Thus, the boundary data h(y) on R corresponds


to the boundary data

H() = h cot 12
iy =

on the unit circle. The Poisson integral formula (14.44) can then be applied to solve the
problem (15.92), from which we reconstruct the solution (15.90) to the boundary value
problem (15.90) on the half plane.
3/7/03

677

c 2003

Peter J. Olver

For example, to solve the problem with the step function

1,
y > 0,
u(0, y) = h(y)
0,
y < 0,
as boundary data, the corresponding boundary data on the unit disk is a (periodic) step
function

1,
0 < < ,
H() =
0,
< < 2 ,
with values + 1 on the upper semicircle, 1 on the lower semicircle, and jump discontinuities at = 1. According to the Poisson formula (14.44), the solution to the latter
boundary value problem is given by
Z
1
1 2
U (, ) =
d
2 0 1 + 2 2 cos( )

1+
1+
1
1
=
cot
tan
tan
+ tan

1
2
1
2

where

= cos ,
= sin .

Finally, we use (15.90) to construct the solution on the upper half plane. We shall spare
the reader the messy details of the final formula. The result is depicted in Figure zpm1h .
Remark : The solution to the preceding Dirichlet boundary value problem is not, in
fact, unique, owing to the unboundedness of the domain. The solution that we pick out by
using the conformal map to the unit disk is the one that remains bounded at . There are
other solutions, but they are unbounded as | z | and would correspond to solutions
on the unit disk that have some form of delta function singularity in their boundary data
at the point 1; see Exercise .
Example 15.39. A non-coaxial cable. The goal of this example is to determine
the electrostatic potential inside a non-coaxial cylindrical cable with prescribed constant
potential values on the two bounding cylinders; see Figure c2 . Assume for definiteness
that the larger cylinder has radius 1, and centered at the origin, while the smaller cylinder
has radius 25 , and is centered at z = 52 . The resulting electrostatic potential will be
independent of the longitudinal coordinate, and so can be viewed as a planar potential in
the annular domain contained between two circles representing the cross-sections of our
cylinders. The desired potential must satisfy the Dirichlet boundary value problem

u = 0,
| z | < 1 and z 52 > 25 ,

u = a, | z | = 1,
u = b, z 2 = 2 .
5

2z 1
According to Example 15.36, the linear fractional transformation =
will map
z 2
1
this non-concentric annular domain to the annulus A.5,1 = 2 < | | < 1 , which is the
cross-section of a coaxial cable. The corresponding transformed potential U (, ) has the
given Dirichlet boundary conditions U = a on | | = 21 and U = b on | | = 1. Clearly the
3/7/03

678

c 2003

Peter J. Olver

coaxial potential U must be a radially symmetric solution to the Laplace equation, and
hence, according to (14.59), of the form
U (, ) = log | | + ,
for constants , . A short computation shows that the particular potential function
U (, ) =

ba
ba
log | | + b =
log( 2 + 2 ) + b
log 2
2 log 2

satisfies the prescribed boundary conditions. Therefore, the desired non-coaxial electrostatic potential

2z 1
ba
(2x 1)2 + y 2
ba

+b=
log
log
+ b.
(15.94)
u(x, y) =
log 2
z2
2 log 2
(x 2)2 + y 2
is given by composition with the given linear fractional transformation. The particular
case a = 0, b = 1 is plotted in Figure coax .

Remark : The same harmonic function solves the problem of determining the equilibrium temperature in an annular plate whose inner boundary is kept at a temperature
u = a while the outer boundary is kept at temperature u = b. One could also interpret this
solution as the equilibrium temperature of a three-dimensional domain contained between
two non-coaxial cylinders held at fixed temperatures. The latter temperature will only
depend upon the transverse x, y coordinates and not upon the longitudinal coordinate.
Remark : A conformal map will also preserve Neumann boundary conditions, specifying the normal derivative u/n = h on the boundary. Indeed, since a conformal map
preserves angles, it maps the normal to to the normal to D at the image point.
Therefore, the transformed harmonic function U (, ) will satisfy the Neumann conditions
U/n = H, where H is related to h via the same equation (15.88).
Applications to Fluid Flow
Conformal mappings are particularly useful in the analysis of planar ideal fluid flow.
Recall that if (z) = (x, y) + i (x, y) is an analytic function that represents the complex
potential function for a steady state fluid flow, then we can interpret its real part (x, y) as
the velocity potential, while the imaginary part (x, y) is the harmonic conjugate stream
function. The level curves of are the equipotential lines, and these are orthogonal to the
level curves of , which are the streamlines followed by the individual fluid particles
except at stagnation points where 0 (z) = 0.
Applying a conformal map = g(z) leads to a transformed complex potential () =
(, ) + i (, ), where (, ) is the potential function and (, ) the stream function
on the new domain. A key fact is that the conformal map will take isopotential lines of
to isopotential lines of and streamlines of to streamlines of . Conformality implies
that the orthogonality relations among isopotentials and streamlines away from stagnation
points is maintained.
3/7/03

679

c 2003

Peter J. Olver

Let us concentrate on the case of flow past a solid object. In three dimensions, the
object is assumed to have a uniform shape in the axial direction, and so we can restrict
our attention to a planar fluid flow around a closed, bounded planar subset D R 2 '
C representing the cross-section of our cylindrical object. The (complex) velocity and
potential are defined on the complementary domain = C \ D occupied by the fluid. The
ideal flow assumptions of incompressibility and irrotationality are reasonably accurate if
the flow is laminar, i.e., far away from turbulent. Then the velocity potential (x, y) will
satisfy the Laplace equation = 0 in the exterior domain . For a solid object, we
should impose the homogeneous Neumann boundary conditions

=0
on the boundary
= D,
(15.95)
n
indicating that there no fluid flux into the object. We note that, according to Exercise ,
a conformal map will automatically preserve the Neumann boundary conditions.
In addition, since the flow is taking place on an unbounded domain, we need to
specify the fluid motion at large distances. We shall assume our object is placed in a
uniform horizontal flow, as in Figure hflow . Thus, at large distance, the flow will not be
affected by the object, and so the velocity should approximate the uniform velocity field
T
v = ( 1, 0 ) , where, for simplicity, we choose our physical units so that the asymptotic
speed of the fluid is equal to 1. Equivalently, the velocity potential should satisfy
(x, y) x,

when

x2 + y 2 0.

Remark : An alternative physical interpretation is that the fluid is at rest, while the
object moves through the fluid at unit speed 1 in a horizontal direction. For example, think
of an airplane flying through the air at constant speed. If we adopt a moving coordinate
system by sitting inside the plane, then the effect is as if the object is sitting still while
the air is moving towards us at unit speed.
Example 15.40. The simplest example is a flat plate moving through the fluid
in a horizontal direction. The plates cross-section is a horizontal line segment, and, for
simplicity, we take it to be the segment D = [ 1, 1 ] lying on the real axis. If the plate is
very thin, it will have absolutely no effect on the horizontal flow of the fluid, and, indeed,
the velocity potential is given by
(x, y) = x,
T

x + i y = C \ [ 1, 1 ].

Note that = ( 1, 0 ) , and hence this flow satisfies the Neumann boundary conditions
(15.95) on the horizontal segment D = . The corresponding complex potential is (z) =
z, with complex veclocity f (z) = 0 (z) = 1.
Example 15.41. Recall that the Joukowski conformal map defined by the analytic
function

1
1
z+
(15.96)
= g(z) =
2
z
squashes the unit circle | z | = 1 down to the real line segment [ 1, 1 ] in the plane.
Therefore, it will map the fluid flow outside the unit disk (the cross-section of a circular
3/7/03

680

c 2003

Peter J. Olver

cylinder) to the fluid flow past the line segment, which, according to the previous example,
has complex potential () = . As a result, the complex potential for the flow past a
disk is the same as the Joukowski function

1
1
(z) = g(z) = g(z) =
z+
.
(15.97)
2
z
Except for a factor of 21 , this agrees with the flow potential we derived in Example 15.18.
The difference is that, at large distances, the current potential
(z)

1
2

for

| z | 1.

T
corresponds to uniform horizontal flow whose velocity 21 , 0
is half as fast. The discrepancy between the two flows can easily be rectified by multiplying (15.97) by 2, whose only
effect is to speed up the flow.
Example 15.42. Let us next consider the case of a tilted plate in a uniformly
horizontal fluid flow. Thus, the cross-section is the line segment
z(t) = t e i ,

1 t 1,

obtained by rotating the horizontal line segment [ 1, 1 ] through an angle , as in Figure tilt . The goal is to construct a fluid flow past the tilted segment that is asymptotically
horizontal at large distance.
The critical observation is that, while the effect of rotating a plate in a fluid flow is
not so evident, we can easily rotate the disk in the flow since it is circularly symmetric,
rotations dont affect it. Thus, the rotation w = e i z maps the Joukowski potential
(15.97) to the complex potential

1
e i
i
i
(w) = (e w) =
e w+
.
2
w
The streamlines of the induced flow are no longer asymptotically horizontal, but rather at
an angle . If we now apply the original Joukowski map (15.96) to the rotated flow, the
circle is again squashed down to the horizontal line segment, but the flow lines continue
to be at angle at large distances. Thus, if we then rotate the resulting flow through
an angle , the net effect will be to tilt the segment to the desired angle while rotating
the streamlines to be asymptotically horizontal. Putting the pieces together, we have the
final complex potential in the form

p
i
2
2
i

.
(15.98)
(z) = e
z cos i sin z e
Sample streamlines for the flow at several attack angles are plotted in Figure tilt .

Example 15.43. As we discovered in Example 15.31, applying the Joukowski map


to off-center disks will, in favorable configurations, produce airfoil-shaped objects. The
fluid motion around such airfoils can thus be obtained by applying the Joukowski map to
the flow past such an off-center circle.
3/7/03

681

c 2003

Peter J. Olver

First, an affine map w = z + will have the effect of moving the original unit
disk | z | 1 to the disk | w | | | with center and radius | |. In particular, the
boundary circle will continue to pass through the point w = 1 provided | | = | 1 |.
Moreover, as noted in Example 15.21, the angular component of has the effect of a
rotation, and so the streamlines around the new disk will, asymptotically, be at an angle
= ph with the horizontal. We then apply the Joukowski transformation

1
1
1
1
=
w+
=
z + +
(15.99)
2
w
2
z +
to map the disk to the airfoil shape. The resulting complex potential for the flow past the
airfoil is obtained by substituting the inverse map
p
+ 2 1
w
=
,
z=

into the original potential (15.97), whereby

!
p
p
1 + 2 1 ( 2 1 )
.
+
() =
2

2 + 1 2
Since the streamlines have been rotated through an angle = ph , we then rotate the
final result back by multiplying by e i in order to see the effect of the airfoil tiled at an
angle in a horizontal flow. Sample streamlines are graphed in Figure airfoilnolift .
We can interpret all these examples as planar cross-sections of three-dimensional fluid
flows past an airplane wing oriented in the longitudinal z direction. The wing is assumed to
have a uniform cross-section shape, and the flow not dependent upon the axial z coordinate.
For wings that are sufficiently long and reaonsalbe (laminar) flows, this model will be valid
away from the wing tips. More complicated airfoils with varying cross-section and faster
flows require a fully three-dimensional fluid model. For such problems, complex analysis
is no longer applicable, and, for the most part, one must rely on numerical integration
techniques. Only in recent years have computers become sufficiently powerful to compute
realistic three-dimensional fluid motions and then only in reasonable mild scenarios .
The two-dimensional versions that have been analyzed here still provide important clues
to the behavior of a three-dimensional flow, as well as useful approximations and starting
points for the three-dimensional airplane wing design problem.
Unfortunately, there is a major flaw with the airfoils that we have just designed.
Potential flows do not produce any lift, and hence the theory indicates that the airplane
will not fly. In order to understand how lift enters into the picture, we need to study
complex integration, and so we will return to this example later. In Example 15.57,
we shall construct an alternative flow past an airfoil that continues to have the correct
asymptotic behavior at large distances, while inducing a nonzero lift. The latter holds the
secret to flight.

The definition of mild relies on the magnitude of the Reynolds number, [ fluid ].

3/7/03

682

c 2003

Peter J. Olver

Poissons Equation and the Greens Function


Although designed for solving the homogeneous Laplace equation, the method of conformal mapping can also be used to solve its inhomogeneous counterpart the Poisson
equation. As we learned in Chapter 14, to solve an inhomogeneous boundary value problem u = f on a domain it suffices to solve the particular versions u = whose
right hand side is a unit impulse concentrated at a point = + i . The resulting
solution u(x, y) = G (x, y) = G(x, y; , ) is the Greens function for the given boundary
value problem. The solution to the boundary value problem associated with a more general
external forcing f (x, y) is then given by a superposition principle
ZZ
G(x, y; , ) f (, ) d d.
(15.100)
u(x, y) =

For the planar Poisson equation, the starting point is the logarithmic potential function

1
1
log | z | = Re
log z,
2
2
which is the solution to the Dirichlet problem
u(x, y) =

u = 0 (x, y),

(x, y) D,

u=0

(15.101)

on

D,

on the unit disk D for an impulse concentrated at the origin; see Section 14.3 for details.
How do we obtain the corresponding solution when the unit impulse is concentrated at
another point = + i D instead of the origin? According to Example 15.26, the
linear fractional transformation
w = g(z) =

z
,
z1

where

| | < 1,

(15.102)

maps the unit disk to itself, moving the point z = to the origin w = g() = 0. The
1
logarithmic potential U =
log | w | will thus be mapped to the Greens function
2

z
1

(15.103)
log
G(x, y; , ) =
2
z1

at the point = + i . Indeed, by the properties of conformal mapping, since U is


harmonic except at the singularity w = 0, the function (15.103) will also be harmonic
except at the image point z = . The fact that the mapping does not affect the delta
function singularity is not hard to check; see Exercise . Moreover, since the conformal
map does not alter the boundary | z | = 1, the function (15.103) continues to satisfy the
homogeneous Dirichlet boundary conditions.
Formula (15.103) reproduces the Poisson formula (14.70) for the Greens function
that we derived previously using the method of images. This identification can be verified
by substituting z = r e i , = e i , or, more simply, by noting that the numerator in
the logarithmic fraction gives the potential due to a unit impulse at z = , while the
denominator represents the image potential at z = 1/ required to cancel out the effect of
the interior potential on the boundary of the unit disk.
3/7/03

683

c 2003

Peter J. Olver

Now that we know the Greens function on the unit disk, we can use the methods of
conformal mapping to produce the Greens function for any other simply connected domain
( C. Let w = g(z) denote the conformal map that takes the domain z to the unit
disk w D, guaranteed by the Riemann Mapping Theorem 15.33. The Greens function
associated with homogeneous Dirichlet boundary conditions on is explicitly given by

g(z) g()
1

log
(15.104)
G(z; ) =
.
g() g(z) 1
2
Example 15.44. For example, according to Example 15.25, the analytic function
w=

z1
z+1

maps the right half plane x = Re z > 0 to the unit disk | | < 1. Therefore, by (15.104),
the Greens function for the right half plane has the form

z1 1

( + 1)(z )
1
1
z+1 +1
.

G(z; ) =
log
log
(15.105)
=

2
z1 1
2
(z
+
1)(z

z+1 +1 1

One can then write the solution to the Poisson equation in a superposition as in (15.100).

15.5. Complex Integration.


All of the magic and power of calculus ultimately rests on the amazing fact that differentiation and integration are mutually inverse operations. And, just as complex functions
have many remarkable differentiability properties not enjoyed by their real siblings, so
complex integration theory has a extra beauty and structure beyond its more mundane
real counterpart. In the remaining two sections of this chapter, we shall develop the basics
of complex integration theory and discuss some of its important applications.
First, let us motivate the definition of a complex integral. As you know, the integral
Z b
of a real function,
f (t) dt, is usually taken along a real interval [ a, b ] R. In complex
a

function theory, integrals are taken along curves in the complex plane, and are thus intimately related to the line integrals appearing in real vector calculus. The identification of
T
a complex number z = x + i y with a planar vector x = ( x, y ) will immediately connect
the two concepts.
Consider a curve C in the complex plane, parametrized, as in (15.69), by z(t) =
x(t) + i y(t) for a t b. We define the complex integral of a complex function f (z) along
the curve C to be
Z
Z b
dz
dt.
(15.106)
f (z) dz =
f (z(t))
dt
C
a
3/7/03

684

c 2003

Peter J. Olver

We shall always assume that the integrand f (z) is a well-defined complex function at each
point on the curve. The result of complex integration of a function along a curve is a
complex number. Let us write out the integrand
f (z) = u(x, y) + i v(x, y)
in terms of its real and imaginary parts. Also, note that

dx
dy
dz
dt =
+i
dz =
dt = dx + i dy.
dt
dt
dt
In this manner, we discover that the complex integral (15.106) splits up into two real line
integrals
Z
Z
Z
Z
f (z) dz =
(u + i v)(dx + i dy) =
(u dx v dy) + i
(v dx + u dy). (15.107)
C

Example 15.45. Let us compute complex integrals


Z
z n dz,

(15.108)

of the monomial function f (z) = z n , where n is an integer, along several different curves.
We begin with the case when the integration curve C is the straight line segment along the
real axis connecting the points 1 to 1, which we parametrize by z(t) = t for 1 t 1.
The defining formula (15.106) implies that the complex integral (15.108) reduces to a real
integral:

Z
Z 1
n = 2 k 0 is even
,
0,
n
n
2
t dt =
z dz =

,
n = 2 k + 1 > 0 is odd.
1
C
n+1

If n 1 is negative, then the singularity of the integrand at at the origin prevents the
integral from converging, and so the complex integral is not defined.
Let us evaluate the same complex integral, but now along a parabolic arc P parametrized by
z(t) = t + i (t2 1),
1 t 1.
Note that, as graphed in Figure C3 , the parabola connects the same two points. We again
refer back to the basic definition (15.106) to evaluate the integral, so
Z
Z 1

n
n
z dz =
t + i (t2 1) (1 + 2 i t) dt.
P

We could, at this point, expand the resulting complex polynomial integrand, and then
integrate term by term. A more elegant approach is to recognize that the integrand is an
exact derivative; namely, by the chain rule

n+1

n
d t + i (t2 1)
= t + i (t2 1) (1 + 2 i t),
dt
n+1
3/7/03

685

c 2003

Peter J. Olver

as long as n 6= 1. Therefore, we can use the Fundamental Theorem of Calculus (which


works equally well for real integrals of complex-valued functions), to evaluate

n+1 1
Z
n = 2 k even,
0,
2
t + i (t 1)

2
z n dz =
=

,
1 6= n = 2 k + 1 odd.
n+1
P
t = 1
n+1
Thus, when n 0 is a positive integer, we obtain the same result as before. Interestingly,
in this case the complex integral is well-defined even when n is a negative integer because,
unlike the real line segment, the parabolic path does not go through the singularity of z n
at z = 0. The case n = 1 needs to be done slightly differently. The integration of 1/z
along the parabolic path is left as an exercise for the reader one that requires some
care. We recommend trying the exercise now, and then verifying your answer once we
have become a little more familiar with basic complex integration techniques.
Finally, let us try integrating around a semi-circular arc, again with the same endpoints
1 and 1. If we parametrize the semi-circle S + by z(t) = e i t , 0 t , we find
Z
Z
Z
Z
n
i nt
it
n dz
dt =
z dz =
e
i e dt =
z
i e i (n+1)t dt
dt
+
0
S
0
0

n = 2 k even,

0,
1 e i (n+1)
e i (n+1)t
=
=
=
2

n + 1 t = 0
n+1
,
1 6= n = 2 k + 1 odd.
n+1
This value is the negative of the previous cases but this can be explained by the fact
that the circular arc is oriented to go from 1 to 1 whereas the line segment and parabola
both go from 1 to 1. Just as with line integrals, the direction of the curve determines the
sign of the complex integral; if we reverse direction, replacing t by t, we end up with the
same value as the preceding two complex integrals. Moreover again provided n 6= 1
it does not matter whether we use the upper semicircle or lower semicircle to go from
1 to 1 the result is exactly the same. However, this remark does not apply to the case
n = 1. Integrating along the upper semicircle S + from 1 to 1 yields
Z
Z
dz
i dt = i ,
(15.109)
=
0
S+ z
whereas integrating along the lower semicircle S from 1 to 1 yields the negative
Z
Z
dz
=
i dt = i .
(15.110)
0
S z

Hence, when integrating the function 1/z, it makes a difference which direction we go
around the origin.
Integrating z n for any integer n 6= 1 around an entire circle gives zero irrespective
of the radius. This can be seen as follows. We parametrize a circle of radius r by z(t) = re i t
for 0 t 2 . Then, by the same computation,
2
I
Z 2
Z 2
rn+1 i (n+1)t
n
n i nt
it
n+1 i (n+1)t
e
z dz =
= 0,
(r e )(r i e ) dt =
ir
e
dt =

n+1
C
0
0
t=0
(15.111)
3/7/03

686

c 2003

Peter J. Olver

provided n 6= 1. Here, as in Chapter A, the circle on the integral sign serves to remind
us that we are integrating around a closed curve. The case n = 1 remains special.
Integrating once around the circle in the counter-clockwise direction yields a nonzero result
Z 2
I
dz
=
i dt = 2 i .
(15.112)
0
C z
Let us note that a complex integral does not depend on the particular parametrization
of the curve C. It does, however, depend upon the orientation of the curve: if we traverse
the curve in the reverse direction, then the complex integral changes its sign. Moreover,
if we chop up the curve into two non-overlapping pieces, C = C1 C2 with a common
orientation, then, just as with a line integral (A.39), line integrals can be decomposed into
a sum over the pieces:
Z
Z
Z
Z
f (z) dz,
C = C 1 C2 .
f (z) dz +
f (z) dz =
f (z) dz,
f (z) =
C

C1

C2

(15.113)
For instance, the integral (15.112) of 1/z around the circle is the difference of the individual
semicircular integrals (15.109), (15.110); the lower semicircular integral acquires a negative
sign to switch its orientation to agree with that of the entire circle.
Note: In complex integration theory, a simple closed curve is often referred to as a
contour , and so complex integration is sometimes referred to as contour integration. Unless
explicitly stated, we always go around contours in the counter-clockwise direction.
Further experiments of this type lead us to suspect that complex integrals are usually
path-independent, and hence evaluate to zero around closed contours. One must be careful,
though, as the integral (15.112) makes clear. Path independence, in fact, follows from the
complex version of the Fundamental Theorem of Calculus.
Theorem 15.46. Let f (z) = F 0 (z) be the derivative of a single-valued complex
function on a domain C. Let C be any curve with initial point and final point
. Then
Z
Z
f (z) dz =

F 0 (z) dz = F () F ().

(15.114)

Proof : This follows immediately from the definition (15.106) and the chain rule:
Z

F (z) dz =
C

dz
dt =
F (z(t))
dt
0

b
a

d
F (z(t)) dt = F (z(b)) F (z(a)) = F () F (),
dt

where = z(a) and = z(b) are the endpoints of the curve.

Q.E.D.

For example, when n =


6 1, the function f (z) = z n is the derivative of the single1
valued function F (z) =
z n+1 . Hence
n+1
Z
n+1
n+1
z n dz =

n+1
n+1
C
3/7/03

687

c 2003

Peter J. Olver

whenever C is a curve connecting to . When n < 0, the curve is not allowed to pass
through the origin z = 0, which is a singularity for z n . Our earlier computations are special
cases of this result.
In contrast, the function f (z) = 1/z is the derivative of
log z = log | z | + i ph z,
but the complex logarithm is no longer single-valued on all of C\{0}, and so Theorem 15.46
cannot be applied directly. However, if our curve is contained within a simply connected
subdomain that does not include the origin, 0 6 C, then we can use any single-valued
branch of the logarithm to evaluate the integral
Z
dz
= log log ,
C z
where , are the endpoints of the curve. Since the common multiples of 2 i cancel, the
answer does not depend upon which particular branch of the logarithm is chosen, but we
do need to be consistent in our choice. For example, on the upper semicircle S + of radius
1 going from 1 to 1,
Z
dz
= log(1) log 1 = i ,
S+ z
where we use the branch of log z = log | z | + i ph z with 0 ph z . On the other hand,
if we integrate on the lower semi-circle S going from 1 to 1, we need to adopt a different
branch, say that with ph z 0. With this choice, the integral becomes
Z
dz
= log(1) log 1 = i ,
S z
thus reproducing (15.109), (15.110). Pay particular attention to the different values of
log(1) in the two cases!
The most important consequence of Theorem 15.46 is that, as long as the integrand
f (z) has a single-valued anti-derivative, its complex integral is independent of the path
connecting two points the value only depends on the endpoints of the curve and not
how one gets from point to point .
Theorem 15.47. If f (z) = F 0 (z) for z and C is any closed curve, then
I
f (z) dz = 0.
(15.115)
C

Conversely, if (15.115) holds for all closed curves C contained in the domain of
definition of f (z), then f admits a single-valued complex anti-derivative with F 0 (z) = f (z).
Proof : We have already demonstrated the first statement. As for the second, we
define
Z z
F (z) =
f (z) dz,
z0

3/7/03

688

c 2003

Peter J. Olver

where z0 is any fixed point, and we choose any convenient curve C connecting
z0 to z. (15.115) assures us that the value does not depend on the chosen path. The proof
that this formula does define an anti-derivative of f is left as an exercise, which can be
solved in the same fashion as the case of a real line integral, cf. (21.19).
Q.E.D.
The preceding considerations suggest the following fundamental theorem, due in its
general form to Cauchy. Before stating it, we introduce the convention that a complex
function f (z) will be called analytic on a domain C provided it is analytic at every
point inside and, in addition, is continuous up to and including its boundary . When
is bounded, its boundary consists of one or more simple closed curves. In general,
we orient so that the domain is always on our left hand side. This means that the
outermost boundary curve is traversed in the counter-clockwise direction, but any interior
holes are take on a clockwise orientation. Our convention is depicted in Figure bdy .
Theorem 15.48. If f (z) is analytic on a bounded domain C, then
I
f (z) dz = 0.

(15.116)

Proof : If we apply Greens Theorem A.25 to the two real line integrals in (15.107),
we find

I
ZZ
v
u
u dx v dy =

= 0,

x y

I
ZZ
u v
v dx + u dy =

= 0,
x y

both of which vanish by virtue of the CauchyRiemann equations (15.22).

Q.E.D.

If the domain of definition of our complex function f (z) is simply connected, then, by
definition, the interior of any closed curve C is contained in , and hence Cauchys
Theorem 15.48 implies the path independence of the complex integral within .
Corollary 15.49.
If f (z) is analytic on a simply connected domain C, then its
Z
complex integral
f (z) dz for C is independent of path. In particular,
C

f (z) dz = 0

(15.117)

for any closed curve C .


Remark : This result also admits a converse: a continuous function f (z) that satisfies
(15.117) for all closed curves is necessarily analytic. See [4] for a proof.
We will also require a slight generalization of this result.

This assumes is a connected domain; otherwise, apply the result to its individual connected
components.

3/7/03

689

c 2003

Peter J. Olver

Lemma 15.50. If f (z) is analytic in a domain that contains two simple closed curves
S and C, and the entire region lying between them, then, assuming they are oriented in
the same direction,
I
I
f (z) dz =

f (z) dz.

(15.118)

Proof : If C and S do not cross each other, we let denote the domain contained between
them, so that = C S; see Figure oints . According to Cauchys Theorem 15.48,
H
f (z) = 0. Now, our orientation convention for means that the outer curve, say

C, is traversed in the counter-clockwise direction, while the inner curve S has the opposite, clockwise orientation. Therefore, if we assign both curves the same counter-clockwise
orientation,
I
I
I
0=
f (z) =
f (z) dz
f (z) dz,

proving (15.118).
If the two curves cross, we can construct a nearby curve K that neither crosses,
as in Figure c2 . By the preceding paragraph, each integral is equal to to that over the
third curve,
I
I
I
f (z) dz =
f (z) dz =
f (z) dz,
C

and formula (15.118) remains valid.

Q.E.D.

Example 15.51. Consider the function f (z) = z n where n is an integer . In


(15.111), we already computed

I
0,
n 6= 1,
n
z dz =
(15.119)
2 i ,
n = 1,
C
when C is a circle centered at z = 0. When n 0, Theorem 15.46 implies that the integral
of z n is 0 over any closed curve in the plane. The same applies in the cases n 2
provided the curve does not pass through the singular point z = 0. In particular, the
integral is zero around closed curves encircling the origin, even though z n for n 2 has
a singularity inside the curve and so Cauchys Theorem 15.48 does not apply as stated.
The case n = 1 has particular significance. Here, Lemma 15.50 implies that the
integral is the same as the integral around a circle provided the curve C also goes once
around the origin in a counter-clockwise direction. Thus (15.112) holds for any closed
curve that goes counter-clockwise once around the origin. More generally, if the curve goes
several times around the origin , then
I
dz
= 2k i
(15.120)
C z

When n is fractional or irrational, the integrals are not well-defined owing to the branch
point singularity at the origin.

Such a curve is definitely not simple and must necessarily cross over itself.

3/7/03

690

c 2003

Peter J. Olver

is an integer multiple of 2 i . The integer k is called the winding number of the curve C,
and measures the total number of times C goes around the origin. For instance, if C winds
three times around 0 in a counter-clockwise fashion, then k = 3, while k = 5 indicates
that the curve winds 5 times around 0 in a clockwise direction, as in Figure wind . In
particular, a winding number k = 0 indicates that C is not wrapped around the origin.
For example, if C is viewed as a loop of string wrapped around a pole (the pole of 1/z at
0) then a winding number k = 0 would indicate that the string can be disentangled from
the pole without cutting; nonzero winding numbers would indicate that the string is truly
entangled .
Lemma 15.52. If C is any simple closed curve, and a is any point not lying on C,

I
dz
2 i ,
a inside C
=
(15.121)
0
a outside C.
C za
If a C, then the integral does not converge.

then

Proof : Note that the integrand f (z) = 1/(z a) is analytic everywhere except at
z = a, where it has a simple pole. If a is outside C, then Cauchys Theorem 15.48 applies,
and the integral is zero. On the other hand, if a is inside C, then Lemma 15.50 implies that
the integral is equal to the integral around a circle centered at z = a. The latter integral
can be computed directly by using the parametrization z(t) = a + r e i t for 0 t 2 , as
in (15.112).
Q.E.D.
Example 15.53. Let D C be a closed and connected domain. Let a, b D be
two points in D. Then

I
I
I
dz
dz
1
1

dz =

=0
za zb
C
C za
C zb
for any closed curve C = C \ D lying outside the domain D. This is because, by
connectivity of D, either C contains both points in its interior, in which case both integrals
equal 2 i , or C contains neither point, in which case both integrals are 0. Theorem 15.47
implies that the integrand admits a single-valued anti-derivative on the domain . On the
other hand, each individual term is the derivative of a multiply-valued complex logarithm.
The conclusion is that, even though the individual logarithms are multiply-valued, their
difference
F (z) = log(z a) log(z b)
is a consistent, single-valued complex function on all of = C \ D. There are, in fact,
an infinite number of possible values, differing by integer multiples of 2 i . However,
assigning a value at one point in leads to a consistent and continuous definition on the
entire domain . Again, this requires that D is connected; the conclusion is not true, say,
for the twice-punctured plane C \ { a, b }.

Actually, there are more subtle three-dimensional considerations that come into play, and
even strings with zero winding number cannot be removed from the pole without cutting if they
are linked in some nontrivial manner, cf. [ 74 ].

3/7/03

691

c 2003

Peter J. Olver

We are sometimes interested in estimating the size of a complex integral. The basic
inequality bounds it in terms of an arc length integral.
Proposition 15.54. The modulus of the integral of the complex function f along a
curve C is bounded by the integral of its modulus with respect to arc length:
Z

| f (z) | ds.
(15.122)
f (z) dz

Proof : We begin with a simple lemma about real integrals of complex functions.

Lemma 15.55. If f (t) is a complex-valued function depending on the real variable


a t b, then

Z
Z b

| f (t) | dt.
(15.123)
f (t) dt

a
a
Proof : If

f (t) dt = 0, the inequality is trivial. Otherwise, let = ph

f (t) dt.

Then, using Exercises and ,

Z
"
# Z
Z b
Z b

b
b
i

i
| f (t) | dt,
Re e
f (t) dt
f (t) dt =
f (t) dt = Re e

a
a
a
a
which proves the lemma.

Q.E.D.

To prove the proposition, we write out the complex integral, and use (15.123) as
follows:

Z b
Z

Z
Z b

dz
dz

f (z(t))
f (z) dz =
dt
dt =
| f (z) | ds,
| f (z(t)) |

dt
dt
a

since | dz | = | z | dt = x2 + y 2 dt = ds is the arc length integral element (A.30). Q.E.D.

Corollary 15.56. If C has length L = L(C), and f (z) is an analytic function such
that | f (z) | M for all points z C, then
Z

f (z) dz M L.
(15.124)

Lift and Circulation

In fluid mechanical applications, the complex integral can be assigned an important


physical interpretation. As above, we consider the steady state flow of an incompressible,
irrotational fluid. Let f (z) = u(x, y) i v(x, y) denote the complex velocity corresponding
T
to the real velocity vector v = ( u(x, y), v(x, y) ) at the point (x, y).
As we noted in (15.107), the integral of the complex velocity f (z) along a curve C
can be written as a pair of real line integrals. In the present situation,
Z
Z
Z
Z
f (z) dz =
(u i v)(dx + i dy) =
(u dx + v dy) i
(v dx u dy). (15.125)
C

3/7/03

692

c 2003

Peter J. Olver

According to (A.37), (A.42), the real part is the circulation integral


Z
Z
v dx =
u dx + v dy,
C

while the imaginary part is minus the flux integral


Z
Z
Z
v n ds =
v dx =
C

(15.126)

v dx u dy,

(15.127)

for the associated steady state fluid flow!


If the complex velocity admits a single-valued complex potential
(z) = (z) i (z),

where

0 (z) = f (z)

which is always the case if its domain of definition is simply connected then the
complex integral is independent of path, and one can use the Fundamental Theorem 15.46
to evaluate it:
Z
f (z) dz = () ()
(15.128)
C

for any curve C connecting to . Path independence of the complex integral immediately
reconfirms the path independence of the flux and circulation integrals for irrotational,
incompressible fluid dynamics. The real part of formula (15.128) evaluates the circulation
integral
Z
Z
v dx =
dx = () (),
(15.129)
C

as the difference in the values of the (real) potential at the endpoints , of the curve C.
On the other hand, the imaginary part of formula (15.128) computes the flux integral
Z
Z
v dx =
dx = () (),
(15.130)
C

as the difference in the values of the stream function at the endpoints of the curve. Thus,
the stream function acts as a flux potential for the flow, with the flux being independent
of path. In particular, if C is a closed contour,
I
I
v dx = 0 =
v dx,
(15.131)
C

and so there is no net circulation or flux along any closed curve in this situation.
In aerodynamics, lift is the result of the circulation of the fluid (air) around the body,
[12, 113]. More precisely, let D C be a closed, bounded subset representing the crosssection of a cylindrical body, e.g., an airplane wing. The velocity vector field v of a steady
state flow around the exterior of the body is defined on the domain = C \ D. According
to Blasius Theorem,I the body will experience a net lift if and only if it has nonvanishing
circulation integral
v dx 6= 0, where C is any simple closed contour encircling the
C

body. However, if the complex velocity admits a single-valued complex potential in ,


then (15.131) tells us that the circulation is automatically zero, and so the body cannot
experience any lift!
3/7/03

693

c 2003

Peter J. Olver

Example 15.57. Consider first the flow around a disk, as discussed in Examples
15.18 and 15.41. The Joukowski potential (z) = z + z 1 is a single-valued analytic
function everywhere except the origin z = 0. Therefore, the circulation integral (15.129)
around any contour encircling the disk will vanish, and hence the disk experiences no net
lift. This is more or less evident from the Figure 15.7 graphing the streamlines of the flow;
they are symmetric above and below the disk, and hence there cannot be any net force in
the vertical direction.
Any conformal map will preserve the single-valuedness of the complex potentials, and
hence will preserve the property of having zero circulation. In particular, all the flows past
airfoils constructed in Example 15.43 also admit single-valued potentials, and so also have
zero circulation integral. Such an airplane will not fly, because its wings experience no lift!
Of course, physical airplanes fly, and so there must be some physical assumption we are
neglecting in our treatment of flow past a body. Abandoning incompressibility or irrotationality would take us outside the magical land of complex variable theory, and into the
wilder regions of fully nonlinear partial differential equations of fluid mechanics. Moreover,
although air is slightly compressible, water is, for all practical purposes, incompressible,
and hydrofoils do experience lift when traveling through water.
The only way to introduce lift into the picture is through a (single-valued) complex
velocity with a non-zero circulation integral, and this requires that its complex potential be
multiply-valued. The one function that we know that has such a property is the complex
logarithm
1
(z) = log(a z + b),
whose derivative
0 (z) =
az + b
is single-valued away from the singularity at z = b/a. Thus, we are naturally led to
introduce the family of complex potentials

1
1
z+
i k log z.
(15.132)
k (z) =
2
z
According to Exercise , the coefficient k must be real in order to maintain the no flux
boundary conditions on the unit circle. By (15.125), the circulation is equal to the real
part of the integral of the complex velocity
dk
1
1
ik
= 2
.
(15.133)
dz
2 2z
z
By Cauchys Theorem 15.48 coupled with formula (15.121), if C is a curve going once
around the disk in a counter-clockwise direction, then

I
I
1
ik
1

fk (z) dz =
dz = 2 k.
2 2 z2
z
C
C
fk (z) =

We center the logarithmic singularity at the origin in order to maintain the no flux boundary
conditions on the unit circle. Moreover, Example 15.53 tells us that more than one logarithm in
the potential is redundant, since the difference of any two logarithms is effectively a single-valued
function, and hence contributes nothing to the circulation integral.

3/7/03

694

c 2003

Peter J. Olver

Therefore, when Re k 6= 0, the circulation integral is non-zero, and the cylinder experiences
a net lift. In Figure liftc , the streamlines for the flow corresponding to a few representative
values of k are plotted. Note the asymmetry of the streamlines that accounts for the lift
experienced by the disk.
When we compose the modified lift potentials (15.132) with the Joukowski transformation (15.99), we obtain a complex potential

1
1
1
1
k () = k (z)
when
=
w+
=
az + +
2
w
2
az +
for flow around the corresponding airfoil the image of the unit disk. The conformal
mapping does not affect the value of the complex integrals, and hence, for any k 6= 0, there
is a nonzero circulation around the airfoil under the modified fluid flow. This circulation
is the cause of a net lift on the airfoil, and at last our airplane will fly!
However, there is now a slight embarrassment of riches, since we have now designed
flows around the airfoil with an arbitrary value 2 k for the circulation integral, and hence
having an arbitrary amount of lift! Which of these possible flows most closely realizes
the true physical version with the correct amount of lift? In his 1902 thesis, the German
mathematician Martin Kutta hypothesized that Nature chooses the constant k so as to
keep the velocity of the flow at the trailing edge of the airfoil, namely = 1, to be finite.
With some additional analysis, it turns out that this condition serves to uniquely specify
k, and yields a reasonably good physical approximation to the actual lift of such an airfoil
in flight, provided the tilt or attack angle of the airfoil in the flow is not too large. Further
details, can be found in several references, including [Fluid, 75, 113].

15.6. Cauchys Integral Formulae and the Calculus of Residues.


Cauchys Integral Theorem 15.48 is a remarkably powerful result. It and its consequences underly most important applications of complex integration. The fact that we can
move the contours of complex integrals around freely as long as we do not cross over
singularities of the integrand grants us great flexibility in their evaluation. Moreover,
it leads to a method for evaluating a function and its derivatives through certain contour
integrals.
As a consequence of Cauchys Theorem, the value of a general complex integral around
a closed contour depends only upon the nature of the singularities of the integrand that
happen to lie inside the contour. This observation inspires us to develop a direct method,
known as the calculus of residues, for evaluating such integrals. The residue method
effectively bypasses the Fundamental Theorem of Calculus no antiderivatives are required! Remarkably, the method of residues can even be applied to evaluate certain types
of real, definite integrals, as the final examples in this section shall demonstrate.
Cauchys Integral Formula
The first important consequence of Cauchys Theorem is the justly famous Cauchy
integral formulae. It gives a formula for the value of an analytic function at a point as a
certain contour integral around a closed curve encircling the point. It is worth emphasizing
3/7/03

695

c 2003

Peter J. Olver

that Cauchys formula is not a form of the Fundamental Theorem of Calculus, since we
are reconstructing the function by integration not its antiderivative! Cauchys formula
is a cornerstone of complex analysis. It has no real counterpart, once again underscoring
the profound difference between the complex and real realms.
Theorem 15.58. Let C be a bounded domain with boundary , and let
a . If f (z) is analytic on , then
I
f (z)
1
dz.
(15.134)
f (a) =
2 i z a
Remark : As always, we traverse the boundary curve so that the domain lies on
our left. In most applications, is simply connected, and so = C is a simple closed
curve oriented in the counter-clockwise direction.
Proof : We first prove that the difference quotient
g(z) =

f (z) f (a)
za

is an analytic function on all of . The only problematic point is at z = a where the


denominator vanishes. First, by the definition of complex derivative,
g(a) = lim

za

f (z) f (a)
= f 0 (a)
za

exists and therefore g(z) is well-defined and, in fact, continuous at z = a. Secondly, we


can compute its derivative at z = a directly from the definition:
g(z) g(a)
f (z) f (a) f 0 (a) (z a)
g (a) = lim
= lim
=
za
za
za
(z a)2
0

1
2

f 00 (a),

where we use Taylors Theorem C.1 (or lHopitals rule) to evaluate the final limit. Since
g is differentiable at z = a, it is an analytic function on all of . Thus, we may appeal to
Cauchys Theorem 15.48, and conclude that
I
I
I
I
f (z) f (a)
f (z) dz
dz
0=
dz =
f (a)
g(z) dz =
za

z a
z a
I
f (z) dz
=
2 i f (a).
z a
The second integral was evaluated using (15.121). Rearranging terms completes the proof
of the Cauchy formula.
Q.E.D.
Remark : The proof shows that if a 6 , then the Cauchy integral vanishes:
I
1
f (z)
dz = 0.
2 i z a
Finally, if a , then the integral does not converge.
3/7/03

696

c 2003

Peter J. Olver

Let us see how we can apply this result to evaluate seemingly intractable complex
integrals.
Example 15.59. Suppose that you are asked to evaluate the complex integral
I
ez dz
2
C z 2z 3

where C is a circle of radius 2 centered at the origin. A direct evaluation is not possible,
since the integrand does not have an elementary antiderivative. However, we note that
ez
ez
f (z)
=
=
2
z 2z 3
(z + 1)(z 3)
z+1

where

f (z) =

ez
z3

is analytic in the disk | z | 2 since its only singularity, at z = 3, lies outside the contour
C. Therefore, by Cauchys formula (15.134), we immediately obtain the integral
I
I
f (z) dz
ez dz
i
=
= 2 i f (1) =
.
2
2e
C z+1
C z 2z 3

Path independence implies that the integral has the same value on any other simple closed
contour, provided it is oriented in the usual counter-clockwise direction, encircles the point
z = 1 but not the point z = 3.
If the contour encloses both singularities, then we cannot apply Cauchys formula
directly. However, as we will see, Theorem 15.58 can be adapted in a direct manner to
such situations. This more general result will lead us directly to the calculus of residues,
to be discussed shortly.
Derivatives by Integration
The fact that we can recover values of complex functions by integration is surprising.
Even more amazing is the fact that we can compute derivatives of complex functions by
integration. Let us differentiate both sides of Cauchys formula (15.134) with respect to
a. The integrand in the Cauchy formula is sufficiently nice so as to allow us to bring the
derivative inside the integral sign. Moreover, the derivative of the Cauchy integrand with
respect to a is easily found:

f (z)
f (z)

=
.
a z a
(z a)2
In this manner, we deduce an integral formulae for the derivative of an analytic function:
I
f (z)
1
0
f (a) =
dz,
(15.135)
2 i C (z a)2
where, as before, C is any closed curve that goes once around the point z = a in a counterclockwise direction. Further differentiation yields the general integral formulae
I
n!
f (z)
(n)
f (a) =
dz
(15.136)
2 i C (z a)n
3/7/03

697

c 2003

Peter J. Olver

that expresses the nth order derivative of a complex function in terms of a contour integral.
These remarkable formulae, which again have no counterpart in real function theory,
can be used to prove our earlier claim that an analytic function is infinitely differentiable,
and thereby complete the proof of Theorem 15.9.
Example 15.60. Let us compute the integral
I
I
ez dz
ez dz
=
,
3
2
2
C (z + 1) (z 3)
C z z 5z 3
around the circle of radius 2 centered at the origin. We use (15.135) with
(z 4) ez
f (z) =
.
(z 3)2

ez
,
f (z) =
z3

Since f (z) is analytic inside C, we conclude that


I
I
ez dz
f (z) dz
5 i
=
= 2 i f 0 (1) =
.
3
2
2
8e
C z z 5z 3
C (z + 1)
One application is the following remarkable result due to Liouville, whom we already
met in Section 10.5. It says that the only bounded complex functions are the constants!
Theorem 15.61. If f (z) is defined and analytic and | f (z) | M for all z C, then
f (z) is constant.
Proof : According to Cauchys formula (15.134), for any point a C,
I
f (z)
1
0
dz
,
f (a) =
2 i CR
(z a)2
where we take CR = { | z a | = R } to be a circle of radius R centered at z = a. We then
estimate the complex integral using (15.122), whence
I

I
I
1
1
M
| f (z) |
M
1
f (z)
0
| f (a) | =
ds
ds =
,

dz

2
2
2
2
(z a)
2 CR | z a |
2 CR R
R
CR

since the length of CR is 2 R. Since f (z) is analytic everywhere, we can let R


and conclude that f 0 (a) = 0. But this occurs for all possible points a, and f 0 (z) 0 is
eveywhere zero, which suffices to prove constancy of f (z).
Q.E.D.

One immediate application is a complex analysis proof of the Fundamental Theorem


of Algebra. Gauss first proved this theorem in 1799, and then gave several further proofs;
see [47] for an extensive discussion. Although this is, in essence, a purely algebraic result,
the proof given here relies in an essential way on complex analysis and complex integration.
Theorem 15.62. Every nonconstant (complex or real) polynomial f (z) has a root
z0 C.
3/7/03

698

c 2003

Peter J. Olver

Proof : Suppose
f (z) = an z n + an1 z n1 + + a1 z + a0 6= 0
for all z C. Then we claim that its reciprocal
g(z) =

1
1
=
n
n1
f (z)
an z + an1 z
+ + a 1 z + a0

satisfies the hypotheses of Liouvilles Theorem 15.61, and hence must be constant, in
contradiction to our hypothesis. Therefore, f (z) cannot be zero for all z, and this proves
the result.
To prove the claim, first by our hypothesis that f (z) 6= 0, we immediately conclude
that g(z) is analytic for all z C. Moreover, | f (z) | as | z | ; indeed, writing

an1
a1
a0
n
| f (z) | = | z | an +
+ + n1 + n ,
z
z
z

the first term clearly foes to as | z | , while the second term is bounded for | z | 0.
Therefore,
1
0
as
| z | ,
| g(z) | =
| f (z) |
and this is enough (see Exercise ) to prove that | g(z) | M is bounded for z C. Q.E.D.
Corollary 15.63. Every complex polynomial of degree n can be factored,
f (z) = an (z z1 ) (z z2 ) (z zn )

where a1 , . . . , an are the roots of f (z).


Proof : The Fundamental Theorem 15.62 guarantees that there is at least one point
z1 C where f (z1 ) = 0. Therefore, we can write
f (z) = (z z1 ) g(z)
where g(z) is a polynomial of degree n 1. The proof is completed via a striaghtforward
induction on the degree of the polynomial.
Q.E.D.
The Calculus of Residues
Cauchys Theorem and Integral Formulae provide amazingly versatile tools for evaluating complicated complex integrals. Since one only needs to understand the singularities
of the integrand within the domain of integration, no indefinite integration is needed to
evaluate the integral! With a little more work, we are led to a general method for evaluating contour integrals, known as the Calculus of Residues for reasons that will soon be
clear. Again, these results and methods have no counterpart in real integration theory.
However, the calculus of residues can, even more remarkably, be used to evaluate a large
variety of interesting definite real integrals for which no explicit indefinite integral exists.
The key idea is encapsulated in the following definition.
Definition 15.64. Let f(z) be an analytic function for all z near, but not equal to, a. The residue of f(z) at the point z = a is defined by the complex integral
\[ \operatorname{Res}_{z = a} f(z) = \frac{1}{2\,\pi\, i} \oint_C f(z)\; dz. \qquad (15.137) \]
The contour integral in (15.137) is taken once in a counter-clockwise direction around
any simple, closed curve C that contains a in its interior, as illustrated in Figure residue .
For example, C could be a small circle centered at a. We require that f (z) be analytic
everywhere inside C except at the point z = a. Lemma 15.50 implies that the value of the
residue does not depend on which curve is chosen. The residue is a complex number, and
tells us certain information about the singularity of f (z) at z = a.
The simplest example is the monomial function f(z) = c z^n, where c is a complex constant and n is an integer. According to (15.111),
\[ \operatorname{Res}_{z = 0} c\, z^n = \frac{1}{2\,\pi\, i} \oint_C c\, z^n\; dz = \begin{cases} 0, & n \neq -1, \\ c, & n = -1. \end{cases} \qquad (15.138) \]
Thus, only the exponent n = −1 gives a nonzero residue. The residue singles out the function 1/z, which, not coincidentally, is the only one with a logarithmic, and hence multiply-valued, antiderivative.
Cauchy's Theorem 15.48, when applied to the integral in (15.137), implies that if f(z) is analytic at z = a, then it has zero residue at a. Therefore, all the monomials, including 1/z, have zero residue at any nonzero point:
\[ \operatorname{Res}_{z = a} c\, z^n = 0 \qquad \text{for} \qquad a \neq 0. \qquad (15.139) \]

Since integration is a linear operation, the residue is a linear operator, mapping complex functions to complex numbers; thus,
\[ \operatorname{Res}_{z = a} \bigl[\, c\, f(z)\, \bigr] = c \operatorname{Res}_{z = a} f(z), \qquad \operatorname{Res}_{z = a} \bigl[\, f(z) + g(z)\, \bigr] = \operatorname{Res}_{z = a} f(z) + \operatorname{Res}_{z = a} g(z), \qquad (15.140) \]
for any complex constant c. Therefore, by linearity, the residue of any finite linear combination
\[ f(z) = \frac{c_{-m}}{z^m} + \frac{c_{-m+1}}{z^{m-1}} + \cdots + \frac{c_{-1}}{z} + c_0 + c_1 z + \cdots + c_n z^n = \sum_{k = -m}^{n} c_k\, z^k \]
of such monomials is equal to
\[ \operatorname{Res}_{z = 0} f(z) = c_{-1}. \]

Thus, the residue effectively picks out the coefficient of the term 1/z in such an expansion.
As we shall shortly see, the same holds true for infinite series of a similar form.
The easiest nontrivial residues to compute are at poles of a function. According to
(15.29), the function f (z) has a simple pole at z = a if
\[ h(z) = (z - a)\, f(z) \qquad (15.141) \]
is analytic at z = a and h(a) ≠ 0.


Lemma 15.65. If f(z) = \dfrac{h(z)}{z - a} has a simple pole at z = a, then \operatorname{Res}_{z=a} f(z) = h(a).

Proof: We substitute the formula for f(z) into the definition (15.137), and so
\[ \operatorname{Res}_{z = a} f(z) = \frac{1}{2\,\pi\, i} \oint_C f(z)\; dz = \frac{1}{2\,\pi\, i} \oint_C \frac{h(z)\; dz}{z - a} = h(a), \]
by Cauchy's formula (15.134). Q.E.D.

Example 15.66. Consider the function
\[ f(z) = \frac{e^z}{z^2 - 2\,z - 3} = \frac{e^z}{(z+1)(z-3)}\,. \]
From the factorization of the denominator, we see that f(z) has simple pole singularities at z = −1 and z = 3. The residues are given, respectively, by
\[ \operatorname{Res}_{z = -1} \frac{e^z}{z^2 - 2\,z - 3} = \left. \frac{e^z}{z - 3}\, \right|_{z = -1} = -\,\frac{1}{4\,e}\,, \qquad \operatorname{Res}_{z = 3} \frac{e^z}{z^2 - 2\,z - 3} = \left. \frac{e^z}{z + 1}\, \right|_{z = 3} = \frac{e^3}{4}\,. \]
Since f (z) is analytic everywhere else, the residue at any other point is automatically 0.
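These two residues can be confirmed with a computer algebra system; the following brief sketch (not part of the original text) uses sympy's residue routine.

    import sympy as sp

    z = sp.symbols('z')
    f = sp.exp(z) / (z**2 - 2*z - 3)
    print(sp.residue(f, z, -1))   # -exp(-1)/4, i.e. -1/(4e)
    print(sp.residue(f, z, 3))    # exp(3)/4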
Recall that a function g(z) is said to have a simple zero at z = a provided
\[ g(z) = (z - a)\, k(z), \]
where k(z) is analytic at z = a and k(a) = g'(a) ≠ 0. In this case, the reciprocal function
\[ f(z) = \frac{1}{g(z)} = \frac{1}{(z - a)\, k(z)} \]
has a simple pole at z = a. The residue of the reciprocal is, by Lemma 15.65,
\[ \operatorname{Res}_{z = a} f(z) = \operatorname{Res}_{z = a} \frac{1}{(z - a)\, k(z)} = \frac{1}{k(a)} = \frac{1}{g'(a)}\,. \qquad (15.142) \]
More generally, if f(z) is analytic at the point a, then the ratio f(z)/g(z) has residue
\[ \operatorname{Res}_{z = a} \frac{f(z)}{g(z)} = \frac{f(a)}{g'(a)} \qquad (15.143) \]
at a simple zero z = a of g(z).


Example 15.67. As an illustration, let us compute the residue of sec z = 1/cos z at the point z = ½π. Note that cos z has a simple zero at z = ½π since its derivative, −sin z, is nonzero there. Thus, according to (15.142),
\[ \operatorname{Res}_{z = \pi/2} \sec z = \operatorname{Res}_{z = \pi/2} \frac{1}{\cos z} = \frac{1}{-\sin\tfrac{1}{2}\pi} = -1. \]
The direct computation of the residue using the complex integral (15.137) is slightly harder, but instructive. For example, we may integrate sec z around a circle of radius 1 centered at ½π, which we parametrize by z(t) = ½π + e^{it}, so that dz = i e^{it} dt, while cos z(t) = cos(½π + e^{it}) = −sin(e^{it}). According to the definition,
\[ \operatorname{Res}_{z = \pi/2} \sec z = \frac{1}{2\,\pi\, i} \oint_C \frac{dz}{\cos z} = -\,\frac{1}{2\,\pi} \int_0^{2\pi} \frac{e^{i t}}{\sin\bigl( e^{i t} \bigr)}\; dt . \]
Expanding the integrand in powers of e^{it}, using the series 1/sin w = 1/w + w/6 + ⋯ with w = e^{it}, every oscillatory term e^{2kit}, k ≥ 1, integrates to zero over a full period, so only the leading constant term contributes, and the integral equals −1, in agreement with the previous computation. Note that the imaginary part of this integral vanishes because it is the integral of an odd function over a symmetric interval, cf. Lemma 11.11.
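For readers who want an independent confirmation (not in the original text), the defining contour integral can also be evaluated numerically; the contour and step count below are illustrative choices.

    import numpy as np

    t = np.linspace(0.0, 2.0 * np.pi, 400001)
    z = np.pi / 2 + np.exp(1j * t)            # circle of radius 1 about pi/2
    dz = 1j * np.exp(1j * t)
    res = np.sum((dz / np.cos(z))[:-1]) * (t[1] - t[0]) / (2j * np.pi)
    print(res)                                # approximately -1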
The Residue Theorem
Residues are the ingredients in a general method for computing contour integrals of
analytic functions. The Residue Theorem says that the value of the integral of a complex
function around a closed curve depends only on its residues at the enclosed singularities.
Since the residues can be computed directly from the function, the resulting formula provides an effective mechanism for painless evaluation of complex integrals, without having
to construct any sort of anti-derivative or indefinite integral. Indeed, the residue method
can be employed even when the integrand does not have an anti-derivative that can be
expressed in terms of elementary functions.
Theorem 15.68. Let C be a simple, closed curve, oriented in the counter-clockwise
direction. Suppose f (z) is analytic everywhere inside C except for a finite number of
singularities, a1 , . . . , an . Then
\[ \frac{1}{2\,\pi\, i} \oint_C f(z)\; dz = \operatorname{Res}_{z = a_1} f(z) + \cdots + \operatorname{Res}_{z = a_n} f(z). \qquad (15.144) \]
Proof : We draw a small circle Ci around each singularity ai . We assume the circles
all lie inside the contour C and do not cross each other, so that ai is the only singularity
contained within Ci ; see Figure resC . Definition 15.64 implies that
\[ \operatorname{Res}_{z = a_i} f(z) = \frac{1}{2\,\pi\, i} \oint_{C_i} f(z)\; dz, \qquad (15.145) \]
where the line integral is taken in the counter-clockwise direction around C i .
Consider the domain Ω consisting of all points z which lie inside the given curve C, but outside all the small circles C₁, …, C_n; this is the shaded region in Figure resC . By our construction, the function f(z) is analytic on Ω, and hence by Cauchy's Theorem 15.48, the integral of f around the boundary ∂Ω is zero. The boundary must be oriented consistently, so that the domain Ω is always lying on one's left-hand side. This means that
the outside contour C should be traversed in a counter-clockwise direction, whereas the
inside circles C_i are traversed in a clockwise direction. Therefore, the integral around the boundary ∂Ω of the domain can be broken up into a difference
\[ 0 = \frac{1}{2\,\pi\, i} \oint_{\partial\Omega} f(z)\; dz = \frac{1}{2\,\pi\, i} \oint_C f(z)\; dz - \sum_{i=1}^{n} \frac{1}{2\,\pi\, i} \oint_{C_i} f(z)\; dz = \frac{1}{2\,\pi\, i} \oint_C f(z)\; dz - \sum_{i=1}^{n} \operatorname{Res}_{z = a_i} f(z). \]

The minus sign converts the circle integrals to the counterclockwise orientation used in
the definition (15.145) of the residues. Rearranging the final identity leads to the residue
formula (15.144).
Q.E.D.
Example 15.69. Let us use residues to evaluate the contour integral
\[ \oint_C \frac{e^z}{z^2 - 2\,z - 3}\; dz, \]
where C denotes a circle of radius r centered at the origin. According to Example 15.66, the integrand has two singularities at −1 and 3, with respective residues −1/(4e) and e³/4. If the radius of the circle is r > 3, then it goes around both singularities, and hence by the residue formula (15.144),
\[ \oint_C \frac{e^z\; dz}{z^2 - 2\,z - 3} = 2\,\pi\, i \left( -\,\frac{1}{4\,e} + \frac{e^3}{4} \right) = \frac{(e^4 - 1)\,\pi\, i}{2\,e}\,. \]
If the circle has radius 1 < r < 3, then it only encircles the singularity at −1, and hence
\[ \oint_C \frac{e^z}{z^2 - 2\,z - 3}\; dz = -\,\frac{\pi\, i}{2\,e}\,. \]
If 0 < r < 1, the function has no singularities inside the circle and hence, by Cauchy's Theorem 15.48, the integral is 0. Finally, when r = 1 or r = 3, the contour passes through a singularity, and the integral does not converge.
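As an illustrative cross-check (not part of the original text), direct numerical contour integration for two radii reproduces the two nonzero cases.

    import numpy as np

    def contour_integral(radius, n=200000):
        t = np.linspace(0.0, 2.0 * np.pi, n + 1)
        z = radius * np.exp(1j * t)
        dz = 1j * radius * np.exp(1j * t)
        vals = np.exp(z) / (z**2 - 2*z - 3) * dz
        return np.sum(vals[:-1]) * (t[1] - t[0])

    print(contour_integral(2.0), -1j * np.pi / (2 * np.e))                 # only z = -1 enclosed
    print(contour_integral(4.0), (np.e**4 - 1) * 1j * np.pi / (2 * np.e))  # both poles enclosed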
Evaluation of Real Integrals
One important and unexpected application of the Residue Theorem 15.68 is to aid in
the evaluation of certain definite real integrals. The interesting fact is that it even applies to
cases in which one is unable to evaluate the corresponding indefinite integral in closed form,
owing to the non-existence of an elementary anti-derivative. Nevertheless, converting the
definite real integral into (part of a) complex contour integral leads to a direct evaluation
via the calculus of residues that sidesteps the difficulties in finding the antiderivative. This
device is indicative of a useful procedure for analyzing standard (meaning analytic) real
functions by passing to their complex counterparts, which can then be tackled by the more
powerful tools of complex analysis.
There are two principal types of real integral for which this technique can be applied,
although numerous variations appear in more extensive treatments of the subject. First,
a real trigonometric integral of the form
\[ I = \int_0^{2\pi} F(\cos\theta, \sin\theta)\; d\theta \qquad (15.146) \]
can often be evaluated by converting it into a complex integral around the unit circle C = { |z| = 1 }. If we set
\[ z = e^{i\theta}, \qquad \text{so that} \qquad \frac{1}{z} = e^{-i\theta}, \]
then
\[ \cos\theta = \frac{e^{i\theta} + e^{-i\theta}}{2} = \frac{1}{2}\left( z + \frac{1}{z} \right), \qquad \sin\theta = \frac{e^{i\theta} - e^{-i\theta}}{2\,i} = \frac{1}{2\,i}\left( z - \frac{1}{z} \right). \qquad (15.147) \]
Moreover,
\[ dz = d\,e^{i\theta} = i\, e^{i\theta}\, d\theta = i\, z\, d\theta, \qquad \text{and so} \qquad d\theta = \frac{dz}{i\, z}\,. \qquad (15.148) \]
Therefore, the integral (15.146) can be written in the complex form
\[ I = \oint_C F\!\left( \frac{1}{2}\left( z + \frac{1}{z} \right),\; \frac{1}{2\,i}\left( z - \frac{1}{z} \right) \right) \frac{dz}{i\, z}\,. \qquad (15.149) \]
If we know that the resulting complex integrand is well-defined and single-valued, except, possibly, for a finite number of singularities inside the unit circle, then the residue formula (15.144) tells us that the integral can be directly evaluated by adding together its residues and multiplying by 2πi.
Example 15.70. We compute the simple example \( \displaystyle\int_0^{2\pi} \frac{d\theta}{2 + \cos\theta} \). We begin by using the substitution (15.149), whence
\[ \int_0^{2\pi} \frac{d\theta}{2 + \cos\theta} = \oint_C \frac{dz}{i\, z \left( 2 + \frac{1}{2}\bigl( z + \frac{1}{z} \bigr) \right)} = -\,i \oint_C \frac{2\; dz}{z^2 + 4\,z + 1}\,. \]
The complex integrand has singularities where its denominator vanishes:
\[ z^2 + 4\,z + 1 = 0, \qquad \text{so that} \qquad z = -2 \pm \sqrt{3}\,. \]
Only one of these singularities, namely −2 + √3, lies inside the unit circle. Therefore, applying (15.143), we find
\[ -\,i \oint_C \frac{2\; dz}{z^2 + 4\,z + 1} = -\,i \cdot 2\,\pi\, i \operatorname{Res}_{z = -2+\sqrt{3}} \frac{2}{z^2 + 4\,z + 1} = 2\,\pi \left. \frac{2}{2\,z + 4}\, \right|_{z = -2+\sqrt{3}} = \frac{2\,\pi}{\sqrt{3}}\,. \]
As the student may recall from first year calculus, this particular integral can, in fact, be done directly via a trigonometric substitution. However, the computations are not particularly pleasant, and, with practice, the residue method is much simpler. Moreover, it straightforwardly applies to situations where no elementary antiderivative exists.
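As a simple cross-check (not from the original text), ordinary numerical quadrature of the real integral agrees with the residue answer 2π/√3.

    import numpy as np
    from scipy.integrate import quad

    value, _ = quad(lambda theta: 1.0 / (2.0 + np.cos(theta)), 0.0, 2.0 * np.pi)
    print(value, 2.0 * np.pi / np.sqrt(3.0))   # both approximately 3.6276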
Example 15.71.
A second type of real integral that can often be evaluated by complex residues are
integrals over the entire real line, from to . Here the technique is a little more
subtle, and we sneak up on the integral by using larger and larger closed contours that
coincide with more and more of the real axis. The basic idea is contained in the following
example.
Example 15.72. The problem is to evaluate the real integral
\[ I = \int_0^{\infty} \frac{\cos x\; dx}{1 + x^2}\,. \qquad (15.150) \]

The corresponding indefinite integral cannot be evaluated in elementary terms, and so we are forced to rely on the calculus of residues. We begin by noting that the integrand is even, and hence the integral I = ½ J is one half the integral
\[ J = \int_{-\infty}^{\infty} \frac{\cos x\; dx}{1 + x^2} \]
over the entire real line. Moreover, for x real, we can write
\[ \frac{\cos x}{1 + x^2} = \operatorname{Re} \frac{e^{i x}}{1 + x^2}\,, \qquad \text{and hence} \qquad J = \operatorname{Re} \int_{-\infty}^{\infty} \frac{e^{i x}\; dx}{1 + x^2}\,. \qquad (15.151) \]
Let C_R be the closed contour consisting of a large semicircle of radius R ≫ 0, which we denote by S_R, connected at the ends by the real interval −R ≤ x ≤ R, which is plotted in Figure semicircle . The corresponding contour integral
\[ \oint_{C_R} \frac{e^{i z}\; dz}{1 + z^2} = \int_{-R}^{R} \frac{e^{i x}\; dx}{1 + x^2} + \int_{S_R} \frac{e^{i z}\; dz}{1 + z^2} \qquad (15.152) \]
breaks up into two pieces: the first over the real interval, and the second over the semicircle. As the radius R → ∞, the semicircular contour C_R includes more and more of the real axis, and so the first integral gets closer and closer to our desired integral (15.151). If we can prove that the second, semicircular integral goes to zero, then we will be able to evaluate the integral over the real axis by contour integration, and hence by the method of residues. The fact that the semicircular integral is small is reasonable, since the integrand (1 + z²)⁻¹ e^{iz} gets smaller and smaller as |z| → ∞ provided Im z ≥ 0. A rigorous verification of this fact will appear at the end of the example.
According to the Residue Theorem 15.68, the integral (15.152) is equal to the sum of all the residues over the singularities of the integrand lying inside the contour C_R. Now e^{iz} is analytic everywhere, and so the singularities occur where the denominator vanishes, i.e., z² = −1, and so are at z = ±i. Since the semicircle lies in the upper half plane Im z > 0, only the first singularity z = +i lies inside, provided the radius R > 1. To compute the residue, we use (15.142) to evaluate
\[ \operatorname{Res}_{z = i} \frac{e^{i z}}{1 + z^2} = \left. \frac{e^{i z}}{2\,z}\, \right|_{z = i} = \frac{e^{-1}}{2\,i} = \frac{1}{2\,i\,e}\,. \]
Therefore, by (15.144),
\[ \frac{1}{2\,\pi\, i} \oint_{C_R} \frac{e^{i z}\; dz}{1 + z^2} = \frac{1}{2\,i\,e}\,, \qquad \text{and so} \qquad \oint_{C_R} \frac{e^{i z}\; dz}{1 + z^2} = \frac{\pi}{e}\,, \]
whenever R > 1. Thus, assuming the semicircular part of the integral does indeed become vanishingly small as R → ∞, we conclude that
\[ \int_{-\infty}^{\infty} \frac{e^{i x}\; dx}{1 + x^2} = \lim_{R \to \infty} \oint_{C_R} \frac{e^{i z}\; dz}{1 + z^2} = 2\,\pi\, i\, \frac{1}{2\,i\,e} = \frac{\pi}{e}\,. \]
The integral is real because its imaginary part,
\[ \int_{-\infty}^{\infty} \frac{\sin x\; dx}{1 + x^2} = 0, \]
is the integral of an odd function, which is automatically zero. Consequently,
\[ I = \int_0^{\infty} \frac{\cos x\; dx}{1 + x^2} = \frac{1}{2} \operatorname{Re} \int_{-\infty}^{\infty} \frac{e^{i x}\; dx}{1 + x^2} = \frac{\pi}{2\,e}\,, \]
which is the desired result.
Finally, let us estimate the size of the semicircular integral. The integrand is bounded by
\[ \left| \frac{e^{i z}}{1 + z^2} \right| \leq \frac{1}{1 + |\, z\, |^2} = \frac{1}{1 + R^2} \qquad \text{whenever} \qquad |\, z\, | = R, \quad \operatorname{Im} z \geq 0, \]
where we are using the fact that
\[ \bigl|\, e^{i z}\, \bigr| = e^{-y} \leq 1 \qquad \text{whenever} \qquad z = x + i\, y \quad \text{with} \quad y \geq 0. \]
According to Corollary 15.56, the size of the integral of a complex function is bounded by its maximum modulus along the curve times the length of the curve. Thus, in our case,
\[ \left| \int_{S_R} \frac{e^{i z}\; dz}{1 + z^2} \right| \leq \frac{1}{1 + R^2}\, L(S_R) = \frac{\pi\, R}{1 + R^2} \leq \frac{\pi}{R}\,. \]
Thus, the semicircular integral becomes vanishingly small as the radius of our semicircle goes to infinity, R → ∞. This completes the justification of the method.
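A numerical cross-check (not from the original text): scipy's quad supports an oscillatory cosine weight on a semi-infinite interval, which handles the slowly decaying tail of this integrand.

    import numpy as np
    from scipy.integrate import quad

    value, _ = quad(lambda x: 1.0 / (1.0 + x**2), 0.0, np.inf, weight='cos', wvar=1.0)
    print(value, np.pi / (2.0 * np.e))   # both approximately 0.5778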
Example 15.73. Here the problem is to evaluate the integral
\[ \int_{-\infty}^{\infty} \frac{dx}{1 + x^4}\,. \qquad (15.153) \]
The indefinite integral can, in fact, be done by partial fractions, but, as anyone who has tried this can tell you, this is not a particularly pleasant task. So, let us try using residues. Let C_R denote the same semicircular contour as in the previous example. The integrand has pole singularities where the denominator vanishes, i.e., z⁴ = −1, and so at the four fourth roots of −1. These are
\[ e^{i\pi/4} = \frac{1 + i}{\sqrt{2}}\,, \qquad e^{3 i\pi/4} = \frac{-1 + i}{\sqrt{2}}\,, \qquad e^{5 i\pi/4} = \frac{-1 - i}{\sqrt{2}}\,, \qquad e^{7 i\pi/4} = \frac{1 - i}{\sqrt{2}}\,. \]

Only the first two roots lie inside the semicircular contour C_R, provided R > 1. Their residues can be computed using (15.142):
\[ \operatorname{Res}_{z = e^{i\pi/4}} \frac{1}{1 + z^4} = \left. \frac{1}{4\,z^3}\, \right|_{z = e^{i\pi/4}} = \frac{e^{-3 i\pi/4}}{4} = \frac{-1 - i}{4\,\sqrt{2}}\,, \qquad \operatorname{Res}_{z = e^{3 i\pi/4}} \frac{1}{1 + z^4} = \left. \frac{1}{4\,z^3}\, \right|_{z = e^{3 i\pi/4}} = \frac{e^{-9 i\pi/4}}{4} = \frac{1 - i}{4\,\sqrt{2}}\,. \]
Therefore, by the residue formula (15.144),
\[ \oint_{C_R} \frac{dz}{1 + z^4} = 2\,\pi\, i \left( \frac{-1 - i}{4\,\sqrt{2}} + \frac{1 - i}{4\,\sqrt{2}} \right) = \frac{\pi}{\sqrt{2}}\,. \qquad (15.154) \]
On the other hand, we can break up the complex integral into an integral along the
real axis and an integral around the semicircle:
I
Z R
Z
dz
dz
dx
=
+
.
4
4
4
CR 1 + z
SR 1 + z
R 1 + x
The first integral goes to the desired real integral as the radius R . On the other
hand, on a large semicircle | z | = R, the integrand 1/(1 + z 4 ) is small:

1
1
1

whenever
| z | = R.
1 + z 4 1 + | z |4 = 1 + R 4
Thus, using Corollary 15.56, the integral around the semicircle can be bounded by

dz

R
0
as
R .

4
4
1+R
R3
SR 1 + z

Thus, as R , the complex integral (15.154) goes to the desired real integral (15.153),
and so
Z

dx
= .
4
2
1 + x
Note that the result is real and positive, as it must be.
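The same residue computation can be delegated to a computer algebra system; the sketch below (not part of the original text) sums the two upper-half-plane residues symbolically.

    import sympy as sp

    z = sp.symbols('z')
    poles = [sp.exp(sp.I * sp.pi / 4), sp.exp(3 * sp.I * sp.pi / 4)]   # roots of z^4 = -1 with Im z > 0
    total = sum(sp.residue(1 / (1 + z**4), z, p) for p in poles)
    print(sp.simplify(2 * sp.pi * sp.I * total))    # sqrt(2)*pi/2, i.e. pi/sqrt(2)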

Chapter 16
Dynamics of Planar Media
In this chapter, we continue our ascent of the dimensional ladder for linear systems.
In Chapter 6, we began our journey by analyzing the equilibrium configurations of discrete
systems massspring chains, circuits and structures which are governed by certain
linear algebraic systems. Next, in Chapter 8, we introduced a continuous time variable
to model the dynamical behavior of such discrete systems by associated systems of linear
ordinary differential equations. Chapter 10 began our treatment of continuous media
with the boundary value problems that describe the equilibrium configurations of onedimensional bars, strings and beams. Dynamical motions of one-dimensional media formed
the focus of Chapter 13, leading to two fundamental partial differential equations: the
heat equation describing thermal diffusion, and the wave equation modeling vibrations. In
Chapters 14 and 15, we focussed our attention on the boundary value problems describing
equilibrium of planar bodies plates and membranes with primary emphasis on the
all-important Laplace equation. We now turn to the analysis of the dynamical behavior of
planar bodies, as governed by the two-dimensional versions of the heat and wave equations.
The heat equation describes diffusion of, say, heat energy or population in a homogeneous
two-dimensional domain. The wave equation models small vibrations of two-dimensional
membranes, e.g., drums.
Although the increase in dimension does exact a toll on our analytical prowess, we
have, in fact, already mastered many of the key techniques. When applied to partial differential equations in higher dimensions, the separation of variables method often results
in ordinary differential equations of a non-elementary type. Solutions are expressed in
terms of certain remarkable and important non-elementary functions, including the Bessel
functions in the present chapter, and the Legendre functions, spherical harmonics, and
spherical Bessel functions appearing in three-dimensional problems. These so-called special functions do not appear in elementary calculus, but do play a starring role in more
advanced applications in physics, engineering and mathematics. Most interesting special
functions arise as solutions to certain second order, self-adjoint boundary value problems
of SturmLiouville type. As such, they obey basic orthogonality relations, and thus can be
used in place of the trigonometric sines and cosines that form the foundations of elementary Fourier analysis. Thus, the series solutions of higher dimensional partial differential
equations lead naturally to the study of special function series. In Appendix C, we collect together the required results about the most important classes of special functions, including a short presentation of the series approach for solving non-elementary ordinary differential equations.

We use the term "dimension" to refer to the number of independent space variables in the system. Time is accorded a special status, and serves to distinguish dynamics from equilibrium.
We will also derive a multi-dimensional version of the fundamental solution, corresponding to an initial concentrated delta function force. This allows one to use a general
superposition principle to solve the initial value problem. Disappointingly, conformal mappings are not particularly helpful in the dynamical universe.
Numerical methods for solving boundary value and initial value problems are, of
course, essential in all but the simplest situations. The two basic methods finite element
and finite difference have already appeared, and the only new aspect is the (substantial)
complication of working in higher dimensions. Thus, in the interests of brevity, we defer
the discussion of the numerical aspects of multi-dimensional partial differential equations
to more advanced texts, e.g., [nPDE], and student projects outlined in the exercises.
However, the student should be assured that, without knowledge of the qualitative features
based on direct analysis and particular solutions, the design, implementation, and testing
of numerical solution techniques would be severely hampered. Explicit solutions continue
to play an important practical role, both as a guide for constructing numerical algorithms,
as well as a convenient test of their accuracy.

16.1. Diffusion in Planar Media.


The heating of a homogeneous flat plate is modeled by the two-dimensional heat
equation

\[ u_t = \gamma\, \Delta u = \gamma\, ( u_{xx} + u_{yy} ), \qquad (16.1) \]
where Δ = ∂²/∂x² + ∂²/∂y² is the two-dimensional Laplacian operator. The solution u(t, x) = u(t, x, y) to (16.1) measures the temperature at time t at each point x = (x, y) in the domain Ω ⊂ ℝ² occupied by the plate. We are assuming that there are no external heat
sources on the interior of our plate, which can be arranged by covering its top and bottom
with insulation. In particular, an equilibrium solution u = u(x, y) does not depend on time
t, so ut = 0, and hence must satisfy the Laplace equation u = 0, which is in accordance
with Chapter 14.
As in the one-dimensional version, the diffusivity coefficient γ > 0, which measures the relative speed of diffusion of heat energy throughout the medium, must be positive. Negative diffusivity results in an ill-posed initial value problem, which experiences the same difficulties as we saw in the one-dimensional backwards heat equation. The physical
justification of the heat equation model will be discussed in detail shortly.
To uniquely specify the temperature u(t, x, y), we must impose both initial and boundary conditions. As with the equilibrium Laplace equation, the most important are
(a) Dirichlet boundary conditions
\[ u = h \qquad \text{on} \qquad \partial\Omega, \qquad (16.2) \]
where the temperature is fixed on the boundary ∂Ω of the plate.


(b) Neumann boundary conditions that prescribe the heat flux or normal derivative
\[ \frac{\partial u}{\partial n} = k \qquad \text{on} \qquad \partial\Omega, \qquad (16.3) \]
with k = 0 corresponding to an insulated boundary.


(c) Mixed boundary conditions, where we impose Dirichlet conditions on part of the boundary ∂D ⊊ ∂Ω and Neumann conditions on the remainder ∂N = ∂Ω \ ∂D. For instance, the homogeneous mixed boundary conditions
\[ u = 0 \quad \text{on} \quad \partial D, \qquad\qquad \frac{\partial u}{\partial n} = 0 \quad \text{on} \quad \partial N, \qquad (16.4) \]
correspond to insulating part of the boundary and freezing the remainder.


In all cases, the boundary data may depend upon time as well as the specific boundary
point. We further specify the initial temperature of the plate
\[ u(0, x, y) = f(x, y), \qquad (x, y) \in \Omega, \qquad (16.5) \]
at an initial time, which, for simplicity, we take as t₀ = 0. If the domain Ω is bounded with a boundary that is not too wild (e.g., piecewise smooth), a general theorem, [31], guarantees the existence of a unique solution u(t, x, y) to any of these self-adjoint initial-boundary value problems for all subsequent times t ≥ 0. Our practical goal is to both compute and understand the behavior of this solution in specific situations.
Derivation of the Diffusion Equation
The physical derivation of the two-dimensional (and three-dimensional) heat equation relies upon the same two basic thermodynamical laws that were used, in Section 13.1, to derive its one-dimensional version. The first principle is that heat energy tries to flow from hot to cold in as fast a way as possible. According to Theorem 18.39, the negative gradient −∇u points in the direction of the steepest decrease in the temperature u at a point, and so, in an isotropic medium, heat energy will flow in that direction. Therefore, the heat flux vector w, which measures the magnitude and direction of the flow of heat energy, should be proportional to the temperature gradient:
\[ \mathbf{w}(t, x, y) = -\,\kappa(x, y)\, \nabla u. \qquad (16.6) \]
The scalar quantity κ(x, y) > 0 measures the thermal conductivity of the material at position (x, y) ∈ Ω. Equation (16.6) is the multi-dimensional counterpart of Fourier's Law of Cooling, cf. (13.4).
The second principle is that, in the absence of external heat sources, heat can only enter a region D ⊂ Ω through its boundary ∂D. (Recall that the plate is insulated above and below.) Let ε(t, x, y) denote the heat energy at each time and point in the domain, so that ∬_D ε(t, x, y) dx dy is the heat energy contained within the region D at time t. The rate of change of heat energy is equal to the heat flux into the region through its boundary, which is the negative of the flux line integral (A.42), namely −∮_{∂D} w · n ds, where, as usual, n denotes the outwards unit normal to the boundary ∂D. Therefore,
\[ \frac{\partial}{\partial t} \iint_D \varepsilon(t, x, y)\; dx\, dy = -\oint_{\partial D} \mathbf{w} \cdot \mathbf{n}\; ds = -\iint_D \nabla \cdot \mathbf{w}\; dx\, dy, \]
where we apply the divergence form (A.57) of Green's Theorem to convert the flux into a double integral. We bring the time derivative inside the integral and collect the terms, whence
\[ \iint_D \left( \frac{\partial \varepsilon}{\partial t} + \nabla \cdot \mathbf{w} \right) dx\, dy = 0. \qquad (16.7) \]
Keep in mind that this integral formula must hold for any subdomain D ⊂ Ω. Now, the only way in which an integral of a continuous function can vanish for all subdomains is if the integrand is identically zero, cf. Exercise . The net result is the basic conservation law
\[ \frac{\partial \varepsilon}{\partial t} + \nabla \cdot \mathbf{w} = 0 \qquad (16.8) \]
relating heat energy ε and heat flux w.

We are assuming the material properties of the plate are not changing in time, and, moreover, are not temperature dependent. Changing the latter assumption would lead to a nonlinear diffusion equation.
relating heat energy and heat flux w.
As in equation (13.1), the heat energy (t, x, y) at each time and point in the domain
is proportional to the temperature,
(t, x, y) = (x, y) u(t, x, y),

where

(x, y) = (x, y) (x, y)

(16.9)

is the product of the density and the heat capacity of the material. Combining this with
the Fourier Law (16.6) and the energy balance equation (16.9) leads to the general twodimensional diffusion equation

u
= u .
t
In full detail, this second order partial differential equation takes the form

u
=
(x, y)
+
(x, y)
.
(x, y)
t
x
x
y
y

(16.10)

(16.11)

In particular, if the body is homogeneous, then both and are constant, and so general
diffusion equation (16.10) reduces to the heat equation (16.1) with thermal diffusivity
=

=
.

(16.12)

The heat and diffusion equations are examples of parabolic partial differential equations, the
terminology being an adaptation of that in Definition 14.1 to partial differential equations
in more than two variables.
Self-Adjoint Formulation
The general diffusion equation (16.10) is in the self-adjoint form
\[ u_t = -\,K[\, u\, ] = -\,\nabla^{*} \circ \nabla\, u. \qquad (16.13) \]
The gradient operator ∇ maps scalar fields u to vector fields v = ∇u. Its adjoint ∇*, which goes in the reverse direction, is taken with respect to the weighted inner products
\[ \langle\, u\,;\, \widetilde u\,\rangle = \iint_\Omega u(x, y)\, \widetilde u(x, y)\, \sigma(x, y)\; dx\, dy, \qquad \langle\langle\, \mathbf{v}\,;\, \widetilde{\mathbf{v}}\,\rangle\rangle = \iint_\Omega \mathbf{v}(x, y) \cdot \widetilde{\mathbf{v}}(x, y)\, \kappa(x, y)\; dx\, dy, \qquad (16.14) \]
between, respectively, scalar and vector fields. A straightforward integration by parts argument similar to that in Section 14.4 tells us that
\[ \nabla^{*} \mathbf{v} = -\,\frac{1}{\sigma}\, \nabla \cdot ( \kappa\, \mathbf{v} ) = -\,\frac{1}{\sigma} \left( \frac{\partial (\kappa\, v_1)}{\partial x} + \frac{\partial (\kappa\, v_2)}{\partial y} \right). \qquad (16.15) \]
Therefore, the right hand side of (16.13) is equal to
\[ -\,K[\, u\, ] = -\,\nabla^{*} \circ \nabla\, u = \frac{1}{\sigma}\, \nabla \cdot ( \kappa\, \nabla u ), \]
which recovers the preceding formula (16.10). As always, we need to impose suitable homogeneous boundary conditions (Dirichlet, Neumann or mixed) to ensure the validity of the integration by parts argument used to establish the adjoint formula (16.15).
In particular, to obtain the heat equation, we take σ and κ to be constant, and so (16.14) reduce, up to a constant factor, to the usual L² inner products between scalar and vector fields. In this case, the adjoint of the gradient is, up to a scale factor, minus the divergence: ∇* = −γ ∇·, where γ = κ/σ, and the general diffusion equation (16.13) reduces to the two-dimensional heat equation (16.1).
As we learned in Chapters 8 and 13, a diffusion equation (16.13) has the form of a gradient flow, decreasing the heat energy
\[ E[\, u\, ] = \|\, \nabla u\, \|^2 = \iint_\Omega \|\, \nabla u(x, y)\, \|^2\, \kappa(x, y)\; dx\, dy \qquad (16.16) \]
as rapidly as possible. Thus, we expect its solutions to decay rapidly to thermal equilibrium, u → u⋆, defined as a minimum of the energy functional.

Remark: The heat and diffusion equations are also used to model diffusion of populations, e.g., bacteria in a petri dish or wolves in the Canadian Rockies, [biol]. The solution u(t, x, y) represents the number of individuals near position (x, y) at time t. The diffusion is caused by random motions of the individuals. Such diffusion processes also appear in the mixing of chemical reagents in solutions, with reactions introducing additional nonlinear terms that result in the broad class of reaction-diffusion equations, [chem].

16.2. Solution Techniques for Diffusion Equations.


We now discuss basic analytical (as opposed to numerical) solution techniques for the
two-dimensional heat and diffusion equations of the form
\[ u_t = -\,K[\, u\, ]. \qquad (16.17) \]

To start with, we shall restrict our attention to homogeneous boundary conditions. As


in the one-dimensional situation of Chapter 13, the method of separation of variables is
crucial. The separable solutions to any diffusion equation (16.13) are of exponential form
\[ u(t, x, y) = e^{-\lambda t}\, v(x, y). \qquad (16.18) \]

Since the linear operator K = ∇*∘∇ only involves differentiation with respect to the spatial variables x, y, we find
\[ \frac{\partial u}{\partial t} = -\,\lambda\, e^{-\lambda t}\, v(x, y), \qquad \text{while} \qquad K[\, u\, ] = e^{-\lambda t}\, K[\, v\, ]. \]
Substituting back into the diffusion equation (16.17) and canceling the exponentials, we conclude that v(x, y) must be an eigenfunction for the boundary value problem
\[ K[\, v\, ] = \lambda\, v. \qquad (16.19) \]

The eigenfunction v is also required to satisfy the relevant homogeneous boundary conditions. In the case of the heat equation (16.1), K[ u ] = u, and hence the eigenvalue
equation (16.19) takes the form
2

v
2v
v + v = 0,
or, in detail,

+ 2 + v = 0.
(16.20)
x2
y
This generalization of the Laplace equation is known as the Helmholtz equation, and was
briefly discussed in Example 14.22.
The fact that K is a positive semi-definite linear operator implies that its eigenvalues are all real and non-negative. We order them in increasing size:
\[ 0 \leq \lambda_1 \leq \lambda_2 \leq \lambda_3 \leq \cdots, \qquad \text{with} \qquad \lambda_n \longrightarrow \infty \quad \text{as} \quad n \to \infty. \qquad (16.21) \]
An eigenvalue is repeated according to the number (which is necessarily finite) of linearly independent eigenfunctions it admits. The problem has a zero eigenvalue, λ₁ = 0, if and only if the operator K is not positive definite, i.e., only in the case of pure Neumann boundary conditions. We refer the interested reader to [31; Chapter V] for the advanced theoretical details.
Each eigenvalue and eigenfunction pair will produce a separable solution
\[ u_k(t, x, y) = e^{-\lambda_k t}\, v_k(x, y) \]
to the diffusion equation (16.17). The solutions corresponding to positive eigenvalues are exponentially decaying in time, while a zero eigenvalue, which only occurs in the positive semi-definite case, produces a constant solution. The general solution to the homogeneous boundary value problem can then be built up as a linear combination of these basic solutions, in the form of an eigenfunction series
\[ u(t, x, y) = \sum_{k=1}^{\infty} c_k\, u_k(t, x, y) = \sum_{k=1}^{\infty} c_k\, e^{-\lambda_k t}\, v_k(x, y), \qquad (16.22) \]

which is a form of generalized Fourier series. The eigenfunction coefficients c_k are prescribed by the initial conditions, which require
\[ \sum_{k=1}^{\infty} c_k\, v_k(x, y) = f(x, y). \qquad (16.23) \]
Thus, to solve the initial value problem, we need to expand the initial data as a series in the eigenfunctions for the Helmholtz boundary value problem.
To compute the coefficients ck in the eigenfunction expansion (16.23), we appeal, as
in the case of ordinary Fourier series, to orthogonality. Self-adjointness of the differential
operator K[ u ] implies that the corresponding eigenfunction solutions v 1 (x, y), v2 (x, y), . . .
to (16.19) are automatically orthogonal with respect to the underlying inner product
\[ \langle\, v_j\,;\, v_k\,\rangle = 0, \qquad j \neq k. \]
The general argument establishing this result can be found in Theorem 8.21; see also Exercise . As a consequence, taking the inner product of both sides of (16.23) with the eigenfunction v_k leads to the equation
\[ c_k\, \|\, v_k\,\|^2 = \langle\, f\,;\, v_k\,\rangle, \qquad \text{and hence} \qquad c_k = \frac{\langle\, f\,;\, v_k\,\rangle}{\|\, v_k\,\|^2}\,. \]

In this manner we recover our standard orthogonality formula (5.7) for expressing elements
of a vector space in terms of an orthogonal basis. For a general diffusion equation, the
orthogonality formula has the explicit form
ZZ
f (x, y) vk (x, y) (x, y) dx dy

ZZ
ck =
,
(16.24)
2
vk (x, y) (x, y) dx dy

where the weighting function (x, y) was defined in (16.9). In the case of the heat equation,
is constant and so can be canceled from both numerator and denominator, leaving the
simpler expression
ZZ
f (x, y) vk (x, y) dx dy
Z
Z
.
(16.25)
ck =
2
vk (x, y) dx dy

The same orthogonality property underlaid our derivation of the Fourier series solution to
the one-dimensional heat equation in Section 13.1.
Under fairly general hypotheses, it can be shown that the eigenfunctions form a complete system, which means that the Fourier series (16.23) will converge (at least in norm)
to the function f (x, y), provided it is not too bizarre. See [31; p. 369] for a proof of the
following general theorem.

If an eigenvalue is repeated, one needs to make sure that one chooses an orthogonal basis for its eigenspace.

Theorem 16.1. Let Ω be a bounded domain with piecewise smooth boundary. If f(x, y) is an L² function on Ω, then its generalized Fourier series (16.23) with coefficients defined by (16.24) converges in norm to f. Moreover, if f ∈ C² is twice continuously differentiable, then its generalized Fourier series converges uniformly to f(x, y) for all (x, y) ∈ Ω.
Qualitative Properties
Before tackling simple examples where we find ourselves in a position to construct
explicit formulae for the eigenfunctions and eigenvalues, let us see what the series solution
(16.22) can tell us about general diffusion processes. Based on our experience with the case
of a one-dimensional bar, the final conclusions will not be especially surprising. Indeed,
they also apply, word for word, to diffusion processes in three-dimensional solid bodies. A
reader who prefers to see explicit solution formulae may wish to skip ahead to the following
section, returning here after digesting the solution formulae.
Keep in mind that we are still dealing with the solution to the homogeneous boundary
value problem. The first observation is that all terms in the series solution (16.22), with
the possible exception of a null eigenfunction term that appears in the semi-definite case,
are tending to zero exponentially fast. Since most eigenvalues are large, all the higher
order terms in the series become almost instantaneously negligible, and hence the solution
can be accurately approximated by a finite sum over the first few eigenfunction modes.
As time goes on, more and more of the modes can be neglected, and the solution decays
to thermal equilibrium at an exponentially fast rate. The rate of convergence to thermal
equilibrium is, for most initial data, governed by the smallest positive eigenvalue λ₁ > 0
for the Helmholtz boundary value problem on the domain.
In the positive definite cases of homogeneous Dirichlet or mixed boundary conditions,
thermal equilibrium is u(t, x, y) → 0. Thus, in these cases, the equilibrium temperature is
equal to the boundary temperature even if this temperature is only fixed on a part of
the boundary. The heat dissipates away through the non-insulated part of the boundary.
In the semi-definite Neumann case, corresponding to a completely insulated plate, the final thermal equilibrium temperature is constant, namely a multiple of the null eigenfunction solution u₀(t, x, y) ≡ 1. In this case, the general solution has the form
\[ u(t, x, y) = c_0 + \sum_{k=1}^{\infty} c_k\, e^{-\lambda_k t}\, v_k(x, y), \qquad (16.26) \]
where the sum is over the positive eigenmodes, λ_k > 0. Since all the summands are exponentially decaying, the equilibrium temperature u⋆ = c₀ is the same as the constant term in the eigenfunction expansion. We evaluate this term using the orthogonality formula (16.24), and so, as t → ∞,
\[ u(t, x, y) \;\longrightarrow\; c_0 = \frac{\langle\, f\,;\, 1\,\rangle}{\|\, 1\,\|^2} = \frac{\displaystyle \iint_\Omega f(x, y)\, \sigma(x, y)\; dx\, dy}{\displaystyle \iint_\Omega \sigma(x, y)\; dx\, dy}\,, \]

which is a weighted average of the initial temperature over the domain. In particular, in
the case of the heat equation, the weighting function is constant, and so the equilibrium
temperature
\[ u(t, x, y) \;\longrightarrow\; c_0 = \frac{1}{\operatorname{area}\Omega} \iint_\Omega f(x, y)\; dx\, dy \qquad (16.27) \]

equals the average or mean initial temperature distribution. In this case, the heat cannot
escape through the boundary, and redistributes itself in a uniform manner over the domain.
As with its one-dimensional form, the planar heat equation has a smoothing effect on
the initial temperature distribution f (x, y). Assume that the eigenfunction coefficients are
uniformly bounded, so |c_k| ≤ M for some constant M. This will certainly be the case if f(x, y) is piecewise continuous, but it even holds for quite rough initial data, including delta functions. Then, at any time t > 0 after the initial instant, the coefficients c_k e^{−λ_k t} in the eigenfunction series solution (16.22) are exponentially small as k → ∞, which is enough to ensure smoothness of the solution u(t, x, y) for each t > 0. Therefore, a diffusion equation
immediately smooths out jumps, corners and other discontinuities in the initial data. As
time progresses, the local variations in the solution become less and less, eventually (or,
more accurately, asymptotically) reaching a constant equilibrium state.
For this reason, diffusion processes can be effectively applied to clean and denoise
planar images. In this application, the initial data f (x, y) represents the grey-scale value
of the image at position (x, y), so that 0 ≤ f(x, y) ≤ 1, with 0 representing black, and 1
representing white. As time progresses, the solution u(t, x, y) represents a more and more
smoothed version of the image. Although this has the effect of removing unwanted noise
from the image, there is also a gradual blurring of the actual features. Thus, the time or
multiscale parameter t needs to be chosen to optimally balance between the two effects
the larger t is the more noise is removed, but the more noticeable the blurring. A
representative illustration appears in Figure im2 . To further suppress undesirable blurring
effects, recent image processing filters are based on nonisotropic (and thus nonlinear)
diffusion equations. See Sapiro, [107], for a survey of recent progress in this active field.
Since the forwards heat equation blurs the features in an image, running it backwards
in time should effectively sharpen the image. However, the one-dimensional argument
presented in Section 13.1 tells us that any direct attempt to run the heat flow backwards
immediately leads to difficulties, and the backwards diffusion equation is ill posed. Various
strategies have been proposed to circumvent this mathematical barrier, and thereby design
effective image enhancement algorithms.
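As a rough illustration of the denoising idea (not from the original text), here is a minimal explicit finite-difference sketch of the heat flow applied to a synthetic noisy grey-scale image; the grid, time step, and step count are arbitrary illustrative choices, not a production image filter.

    import numpy as np

    def heat_smooth(image, steps=50, dt=0.2):
        """Run `steps` explicit Euler steps of u_t = u_xx + u_yy (grid spacing 1)."""
        u = image.astype(float).copy()
        for _ in range(steps):
            p = np.pad(u, 1, mode='edge')    # reflecting edges approximates an insulated boundary
            lap = (p[:-2, 1:-1] + p[2:, 1:-1] + p[1:-1, :-2] + p[1:-1, 2:] - 4.0 * u)
            u += dt * lap                    # dt <= 0.25 keeps the explicit scheme stable
        return u

    noisy = np.clip(0.5 + 0.2 * np.random.randn(64, 64), 0.0, 1.0)
    print(noisy.std(), heat_smooth(noisy).std())   # the local variation decreases with t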
Inhomogeneous Boundary Conditions and Forcing
Finally, let us briefly mention how to incorporate inhomogeneous boundary conditions
and external heat sources into the problem. Consider, as a specific example, the forced
heat equation
\[ u_t = \gamma\, \Delta u + F(x, y), \qquad (x, y) \in \Omega, \qquad (16.28) \]

For a general diffusion equation, this requires that the functions σ(x, y) and κ(x, y) be smooth.

where F(x, y) represents an unvarying external heat source, subject to inhomogeneous Dirichlet boundary conditions
\[ u = h \qquad \text{for} \qquad (x, y) \in \partial\Omega. \qquad (16.29) \]
When the external forcing is fixed for all t, we expect the solution to eventually settle down to an equilibrium configuration: u(t, x, y) → u⋆(x, y) as t → ∞.
To determine the time-independent equilibrium temperature u = u⋆(x, y), we set u_t = 0 in the differential equation (16.28). We immediately see that equilibrium for this problem is characterized by the solution to the Poisson equation
\[ -\,\gamma\, \Delta u_{\star} = F, \qquad (x, y) \in \Omega, \qquad (16.30) \]
subject to the same inhomogeneous Dirichlet boundary conditions (16.29). Positive definiteness of the Dirichlet boundary value problem implies that there is a unique equilibrium solution u⋆, which can be characterized as the sole minimizer of the Dirichlet principle; for details see Section 14.4.
Once we have determined the equilibrium solution (usually through a numerical approximation), we set
\[ \widetilde u(t, x, y) = u(t, x, y) - u_{\star}(x, y), \]
so that ũ measures the deviation of the solution u from its eventual equilibrium. By linearity, ũ(t, x, y) satisfies the unforced heat equation subject to homogeneous boundary conditions:
\[ \widetilde u_t = \gamma\, \Delta \widetilde u, \quad (x, y) \in \Omega, \qquad\qquad \widetilde u = 0, \quad (x, y) \in \partial\Omega. \]
Therefore, ũ can be expanded in an eigenfunction series (16.22), and hence will decay to zero, ũ(t, x, y) → 0, at an exponentially fast rate governed by the smallest eigenvalue λ₁ of the corresponding homogeneous Helmholtz boundary value problem. Consequently, the solution to the forced, inhomogeneous problem,
\[ u(t, x, y) = \widetilde u(t, x, y) + u_{\star}(x, y) \;\longrightarrow\; u_{\star}(x, y), \]
will approach thermal equilibrium, namely u⋆(x, y), at the same exponential rate as the homogeneous boundary value problem.

16.3. Explicit Solutions for the Heat Equation.


As noted earlier, explicit solution formulae are few and far between. In this section,
we discuss two specific cases where the Helmholtz eigenfunctions can be found in closed
form. The calculations rely on a separation of variables, which is applicable only to a
rather limited restricted class of domains, which include the rectangles and disks that we
discuss in detail here.
Heating of a Rectangle
A homogeneous rectangular plate

R = 0 < x < a, 0 < y < b


3/7/03

717

c 2003

Peter J. Olver

is heated to a prescribed initial temperature u(0, x, y) = f (x, y) and then insulated. The
sides of the plate are held at zero temperature. Our task is to determine how fast the plate
returns to thermal equilibrium.
The temperature u(t, x, y) evolves according to the heat equation
\[ u_t = \gamma\, ( u_{xx} + u_{yy} ), \]
subject to homogeneous Dirichlet conditions
\[ u(0, y) = u(a, y) = 0 = u(x, 0) = u(x, b), \qquad 0 < x < a, \quad 0 < y < b, \qquad (16.31) \]

along the boundary of the rectangle. As in (16.18), the basic solutions to the heat equation are obtained from an exponential ansatz u(t, x, y) = e t v(x, y). Substituting this
expressing into the leat equation, we find that the function v(x, y) solves the Helmholtz
eigenvalue problem
(vxx + vyy ) + v = 0,
(x, y) R,
(16.32)
subject to the same homogeneous Dirichlet boundary conditions
v(x, y) = 0,

(x, y) R.

(16.33)

To solve the rectangular Helmholtz eigenvalue problem, we shall, as in (14.13), introduce a further separation of variables, writing
v(x, y) = p(x) q(y)
as the product of functions depending upon the individual Cartesian coordinates. Substituting this ansatz into the Helmholtz equation (16.32), we find
\[ \gamma\, \bigl[\, p''(x)\, q(y) + p(x)\, q''(y)\, \bigr] + \lambda\, p(x)\, q(y) = 0. \]
To effect the variable separation, we collect all terms involving x on one side and all terms involving y on the other side of the equation. This is accomplished by dividing by γ v = γ p q and rearranging the terms; the result is
\[ -\,\frac{p''}{p} = \frac{q''}{q} + \frac{\lambda}{\gamma}\,. \]
The left hand side of this equation only depends on x, whereas the right hand side only depends on y. As argued in Section 14.2, the only way this can occur is if the two sides equal a common separation constant, denoted by μ. (The minus sign is for later convenience.) In this manner, we reduce our partial differential equation to a pair of one-dimensional eigenvalue problems
\[ p'' + \mu\, p = 0, \qquad\qquad q'' + \Bigl( \frac{\lambda}{\gamma} - \mu \Bigr)\, q = 0, \]
each of which is subject to homogeneous Dirichlet boundary conditions
\[ p(0) = p(a) = 0, \qquad\qquad q(0) = q(b) = 0. \]

To obtain a nontrivial solution to the Helmholtz equation, we seek nonzero solutions to


these two supplementary eigenvalue problems. The fact that we are dealing with a rectangular domain is critical to the success of this approach.
We have already solved these particular two boundary value problems many times;
see, for instance, equation (13.17). The eigenfunctions are, respectively,
\[ p_m(x) = \sin\frac{m\,\pi\, x}{a}\,, \quad m = 1, 2, 3, \ldots, \qquad\qquad q_n(y) = \sin\frac{n\,\pi\, y}{b}\,, \quad n = 1, 2, 3, \ldots, \]
with
\[ \mu = \frac{m^2\,\pi^2}{a^2}\,, \qquad \frac{\lambda}{\gamma} - \mu = \frac{n^2\,\pi^2}{b^2}\,, \qquad \text{so that} \qquad \lambda = \Bigl( \frac{m^2}{a^2} + \frac{n^2}{b^2} \Bigr)\, \pi^2\, \gamma\,. \]
Therefore, the separable eigenfunction solutions to the Helmholtz boundary value problem (16.32), (16.33) have the doubly trigonometric form
\[ v_{m,n}(x, y) = \sin\frac{m\,\pi\, x}{a}\, \sin\frac{n\,\pi\, y}{b}\,, \qquad (16.34) \]
with corresponding eigenvalues
\[ \lambda_{m,n} = \frac{m^2\,\pi^2}{a^2}\, \gamma + \frac{n^2\,\pi^2}{b^2}\, \gamma = \Bigl( \frac{m^2}{a^2} + \frac{n^2}{b^2} \Bigr)\, \pi^2\, \gamma\,. \qquad (16.35) \]
Each of these corresponds to an exponentially decaying, separable solution
\[ u_{m,n}(t, x, y) = e^{-\lambda_{m,n} t}\, v_{m,n}(x, y) = \exp\Bigl[ -\Bigl( \frac{m^2}{a^2} + \frac{n^2}{b^2} \Bigr)\, \pi^2\, \gamma\, t \Bigr]\, \sin\frac{m\,\pi\, x}{a}\, \sin\frac{n\,\pi\, y}{b} \qquad (16.36) \]
to the original heat equation.
Using the fact that the univariate sine functions form a complete system, it is not hard
to prove, [122], that the separable eigenfunction solutions (16.36) are complete, which
means that there are no non-separable eigenfunctions. As a consequence, the general
solution to the initial-boundary value problem can be expressed as a linear combination
\[ u(t, x, y) = \sum_{m,n=1}^{\infty} c_{m,n}\, u_{m,n}(t, x, y) = \sum_{m,n=1}^{\infty} c_{m,n}\, e^{-\lambda_{m,n} t}\, v_{m,n}(x, y) \qquad (16.37) \]
of our eigenfunction modes. The coefficients c_{m,n} are prescribed by the initial conditions, which take the form of a double Fourier sine series
\[ f(x, y) = u(0, x, y) = \sum_{m,n=1}^{\infty} c_{m,n}\, v_{m,n}(x, y) = \sum_{m,n=1}^{\infty} c_{m,n}\, \sin\frac{m\,\pi\, x}{a}\, \sin\frac{n\,\pi\, y}{b}\,. \]
Self-adjointness of the Laplacian coupled with the boundary conditions implies that the eigenfunctions v_{m,n}(x, y) are orthogonal with respect to the L² inner product on the rectangle, so
\[ \langle\, v_{k,l}\,;\, v_{m,n}\,\rangle = \int_0^b\!\! \int_0^a v_{k,l}(x, y)\, v_{m,n}(x, y)\; dx\, dy = 0 \qquad \text{unless} \qquad k = m \ \text{and} \ l = n. \]

(The sceptical reader can verify the orthogonality relations directly from the formulae for the eigenfunctions.) Thus, we can use our usual orthogonality formula (16.25) to compute the coefficients
\[ c_{m,n} = \frac{\langle\, f\,;\, v_{m,n}\,\rangle}{\|\, v_{m,n}\,\|^2} = \frac{4}{a\, b} \int_0^b\!\! \int_0^a f(x, y)\, \sin\frac{m\,\pi\, x}{a}\, \sin\frac{n\,\pi\, y}{b}\; dx\, dy, \qquad (16.38) \]
where the formula for the norms of the eigenfunctions,
\[ \|\, v_{m,n}\,\|^2 = \int_0^b\!\! \int_0^a v_{m,n}(x, y)^2\; dx\, dy = \tfrac{1}{4}\, a\, b, \qquad (16.39) \]
follows from a direct evaluation of the double integral. (Unfortunately, while the orthogonality is automatic, the computation of the norm must inevitably be done by hand.)
The rectangle approaches thermal equilibrium at the rate equal to the smallest eigenvalue,
\[ \lambda_{1,1} = \Bigl( \frac{1}{a^2} + \frac{1}{b^2} \Bigr)\, \pi^2\, \gamma\,, \qquad (16.40) \]
which depends upon the reciprocals of the squared lengths of the sides of the rectangle and the diffusion coefficient. The larger the rectangle, or the smaller the diffusion coefficient, the smaller λ_{1,1}, and hence the slower the return to thermal equilibrium. The higher modes, with m and n large, decay to zero almost instantaneously, and so the solution immediately behaves like a finite sum over a few low order modes. Assuming that c_{1,1} ≠ 0, the slowest decaying mode in the Fourier series (16.37) is
\[ c_{1,1}\, u_{1,1}(t, x, y) = c_{1,1}\, \exp\Bigl[ -\Bigl( \frac{1}{a^2} + \frac{1}{b^2} \Bigr)\, \pi^2\, \gamma\, t \Bigr]\, \sin\frac{\pi\, x}{a}\, \sin\frac{\pi\, y}{b}\,. \qquad (16.41) \]
Thus, in the long run, the temperature is of one sign, either positive or negative depending upon the sign of c_{1,1}, throughout the rectangle. As in the one-dimensional case, this observation is, in fact, indicative of the general phenomenon that the eigenfunction associated with the smallest positive eigenvalue of the Laplacian is of one sign throughout the domain. A typical solution is plotted in Figure heatrect .
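The following sketch (not part of the original text) evaluates the series solution (16.37)-(16.38) numerically on a unit square with γ = 1; the initial data f, the grid, and the truncation order are illustrative choices only.

    import numpy as np

    a = b = 1.0
    gamma = 1.0
    N = 20                                          # truncation order of the double sine series
    f = lambda x, y: x * (a - x) * y * (b - y)      # illustrative initial temperature

    xs = np.linspace(0.0, a, 201)
    ys = np.linspace(0.0, b, 201)
    dx, dy = xs[1] - xs[0], ys[1] - ys[0]
    X, Y = np.meshgrid(xs, ys, indexing='ij')
    F = f(X, Y)

    def coeff(m, n):
        # c_{m,n} from (16.38), approximated by a simple tensor-product Riemann sum
        integrand = F * np.sin(m * np.pi * X / a) * np.sin(n * np.pi * Y / b)
        return 4.0 / (a * b) * integrand.sum() * dx * dy

    def u(t, x, y):
        total = 0.0
        for m in range(1, N + 1):
            for n in range(1, N + 1):
                lam = (m**2 / a**2 + n**2 / b**2) * np.pi**2 * gamma
                total += (coeff(m, n) * np.exp(-lam * t)
                          * np.sin(m * np.pi * x / a) * np.sin(n * np.pi * y / b))
        return total

    print(u(0.0, 0.5, 0.5), f(0.5, 0.5))   # the series approximately reproduces the initial data
    print(u(0.1, 0.5, 0.5))                # and decays toward zero as t grows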
Heating of a Disk
Let us perform a similar analysis for the heating of a circular disk. For simplicity, we take the diffusion coefficient γ = 1. We also assume that the disk D = { x² + y² ≤ 1 } has radius 1. We shall solve the heat equation on D subject to homogeneous Dirichlet boundary values of zero temperature at the circular edge ∂D = C. Thus, the full initial-boundary value problem is
\[ \begin{aligned} u_t &= \Delta u, \qquad & & x^2 + y^2 < 1, \quad t > 0, \\ u(t, x, y) &= 0, & & x^2 + y^2 = 1, \\ u(0, x, y) &= f(x, y), & & x^2 + y^2 \leq 1. \end{aligned} \qquad (16.42) \]
A simple rescaling of space and time can be used to recover the solution for an arbitrary diffusion coefficient and a disk of arbitrary radius from this particular case; see Exercise .

Since we are working in a circular domain, we instinctively pass to polar coordinates (r, θ). In view of the polar coordinate formula (14.27) for the Laplace operator, the heat equation and boundary conditions take the form
\[ \frac{\partial u}{\partial t} = \frac{\partial^2 u}{\partial r^2} + \frac{1}{r}\, \frac{\partial u}{\partial r} + \frac{1}{r^2}\, \frac{\partial^2 u}{\partial \theta^2}\,, \qquad 0 \leq r < 1, \quad t > 0, \]
\[ u(t, 1, \theta) = 0, \qquad\qquad u(0, r, \theta) = f(r, \theta), \qquad r \leq 1, \qquad (16.43) \]
where the solution u(t, r, θ) is defined for all 0 ≤ r ≤ 1 and t ≥ 0. Moreover,
\[ u(t, r, \theta + 2\pi) = u(t, r, \theta) \]
must be a 2π periodic function of the angular variable to ensure that it represents a single-valued function on the entire disk.
To obtain the separable solutions
\[ u(t, r, \theta) = e^{-\lambda t}\, v(r, \theta), \]
we need to solve the polar coordinate form of the Helmholtz equation
\[ \frac{\partial^2 v}{\partial r^2} + \frac{1}{r}\, \frac{\partial v}{\partial r} + \frac{1}{r^2}\, \frac{\partial^2 v}{\partial \theta^2} + \lambda\, v = 0, \qquad 0 \leq r < 1, \quad 0 \leq \theta \leq 2\pi, \qquad (16.44) \]
subject to the boundary conditions
\[ v(1, \theta) = 0, \qquad\qquad v(r, \theta + 2\pi) = v(r, \theta). \qquad (16.45) \]

We apply a further separation of variables to the polar Helmholtz equation by writing
\[ v(r, \theta) = p(r)\, q(\theta). \qquad (16.46) \]
Substituting into (16.44), and then collecting together all terms involving r and all terms involving θ, we are led to the pair of ordinary differential equations
\[ r^2\, p'' + r\, p' + ( \lambda\, r^2 - \mu )\, p = 0, \qquad\qquad q'' + \mu\, q = 0, \]
where λ is the Helmholtz eigenvalue, and μ the separation constant. The periodicity condition (16.45) requires that q(θ) be 2π periodic, and hence
\[ q(\theta) = \cos m\,\theta \quad \text{or} \quad \sin m\,\theta, \qquad \text{where} \qquad \mu = m^2, \qquad (16.47) \]
span the eigenspace. Note that when m = 0, there is only one independent periodic solution, namely q(θ) ≡ 1; the second solution, q(θ) = θ, does not satisfy the periodicity constraint.
Using the preceding formula for the separation constant, μ = m², the differential equation for p(r) assumes the form
\[ r^2\, \frac{d^2 p}{dr^2} + r\, \frac{dp}{dr} + ( \lambda\, r^2 - m^2 )\, p = 0, \qquad 0 \leq r \leq 1. \qquad (16.48) \]
Ordinarily, one requires two boundary conditions to specify a solution to such a second order boundary value problem. But our Dirichlet condition, namely p(1) = 0, only specifies

its value at one of the endpoints: r = 1. The other endpoint, r = 0, is a singular


point for the ordinary differential equation, because the coefficient, r², of the highest order derivative, p″, vanishes there. This should remind you of our solution to the Euler differential equation (14.32) when we solved the Laplace equation on the disk. As there, we only need that the solution be bounded at r = 0, and hence are led to require
\[ |\, p(0)\, | < \infty, \qquad p(1) = 0. \qquad (16.49) \]

These singular boundary conditions turn out to be sufficient to distinguish the relevant
eigenfunction solutions to (16.48).
Although (16.48) arises in a variety of applications, this may be the first time that
you have encountered this particular ordinary differential equation. It is not an elementary
equation, and its solutions cannot be written in terms of elementary functions. Nevertheless, owing to its significance in a wide range of physical applications, its solutions have
been extensively studied and are, in a sense, well-known. After some preliminary manipulations we shall summarize the known facts about the solutions, leaving details and proofs
to Appendix C.
To simplify the analysis, we make a preliminary rescaling of the independent variable, replacing r by
\[ z = \sqrt{\lambda}\; r. \]
Note that, by the chain rule,
\[ \frac{dp}{dr} = \sqrt{\lambda}\, \frac{dp}{dz}\,, \qquad \frac{d^2 p}{dr^2} = \lambda\, \frac{d^2 p}{dz^2}\,, \qquad \text{and hence} \qquad r\, \frac{dp}{dr} = z\, \frac{dp}{dz}\,, \qquad r^2\, \frac{d^2 p}{dr^2} = z^2\, \frac{d^2 p}{dz^2}\,. \]
The net effect is to eliminate the eigenvalue parameter λ (or, rather, hide it in the change of variables), so that (16.48) assumes the slightly simpler form
\[ z^2\, \frac{d^2 p}{dz^2} + z\, \frac{dp}{dz} + ( z^2 - m^2 )\, p = 0. \qquad (16.50) \]

The ordinary differential equation (16.50) is known as Bessel's equation, named after the early 19th century astronomer Wilhelm Bessel, who used its solutions to solve a problem arising in the study of planetary orbits. The solutions to Bessel's equation are an indispensable tool in applied mathematics, physics and engineering.

[Figure 16.1. Bessel Functions: graphs of J0(z), J1(z), and J2(z).]

The Bessel equation cannot (except in special instances) be solved in terms of elementary functions. The one thing we know for sure is that, as with any second order ordinary differential equation, there are two linearly independent solutions. However, it turns out that, up to constant multiple, only one solution remains bounded as z → 0. This solution is known as the Bessel function of order m, and is denoted by J_m(z). Applying the general systematic method for finding power series solutions to linear ordinary differential equations presented in Appendix C, it can be shown that the Bessel function of order m has the Taylor expansion
\[ J_m(z) = \sum_{k=0}^{\infty} \frac{(-1)^k\, z^{m+2k}}{2^{m+2k}\; k\,!\; (m+k)\,!} = \frac{z^m}{2^m\, m\,!} \left( 1 - \frac{z^2}{4\,(m+1)} + \frac{z^4}{32\,(m+1)(m+2)} - \frac{z^6}{384\,(m+1)(m+2)(m+3)} + \cdots \right) \qquad (16.51) \]
at the origin z = 0. Verification that this series solves the Bessel equation of order m is a
straightforward exercise. Moreover, a simple application of the ratio test for power series
tells us that the series converges for all (complex) values of z. Indeed, the convergence is
quite rapid when z is of moderate size, and so summing the series is a reasonably effective method for computing the Bessel function Jm (z) although in serious applications
one adopts more sophisticated numerical techniques based on asymptotic expansions and
integral formulae, [3, 96]. Figure 16.1 displays graphs of the first three Bessel functions
for z > 0. Most software packages, both symbolic and numerical, contain routines for
accurately evaluating and graphing Bessel functions.
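For instance (this sketch is not part of the original text), one can compare a direct truncation of the series (16.51) with the Bessel routine in scipy; the truncation order and evaluation point are arbitrary illustrative choices.

    from math import factorial
    import numpy as np
    from scipy.special import jv    # J_m(z)

    def J_series(m, z, terms=30):
        return sum((-1)**k * z**(m + 2*k) / (2**(m + 2*k) * factorial(k) * factorial(m + k))
                   for k in range(terms))

    for m in (0, 1, 2):
        z = 5.0
        print(m, J_series(m, z), jv(m, z))   # the two values agree to many digits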

Reverting back to our original radial coordinate r = z/√λ, we conclude that every solution to the radial equation (16.48) which is bounded at r = 0 is a constant multiple
\[ p(r) = J_m\bigl( \sqrt{\lambda}\; r \bigr) \qquad (16.52) \]
of the rescaled Bessel function of order m. So far, we have only dealt with the boundary condition at the singular point r = 0. The Dirichlet condition at the other end requires
\[ p(1) = J_m\bigl( \sqrt{\lambda}\, \bigr) = 0. \]
Therefore, in order that λ be a legitimate eigenvalue, √λ must be a root of the m-th order Bessel function J_m.

Remark: We already know, owing to the positive definiteness of the Dirichlet boundary value problem, that the eigenvalues λ > 0 must be positive, so there is no problem taking the square root. Indeed, it can be proved that the Bessel functions do not have any negative roots!
The graphs of Jm (z) strongly indicate, and, indeed, it can be rigorously proved, that
each Bessel function oscillates between positive and negative values as z increases above
3/7/03

723

c 2003

Peter J. Olver

0, with slowly decreasing amplitude. As a consequence, there exists an infinite sequence


of Bessel roots, which we number in the order in which they appear:
\[ J_m( \zeta_{m,n} ) = 0, \qquad \text{where} \qquad 0 < \zeta_{m,1} < \zeta_{m,2} < \zeta_{m,3} < \cdots, \qquad \text{with} \qquad \zeta_{m,n} \longrightarrow \infty. \qquad (16.53) \]
It is worth noting that the Bessel functions are not periodic, and their roots are not evenly
spaced.
Owing to their physical importance in a wide range of problems, the Bessel roots have
been extensively tabulated in the literature, cf. [3, 39]. A table of all Bessel roots that are
< 12 in magnitude follows. The rows of the table are indexed by n, the root number, and
the columns by m, the order of the Bessel function.
Table of Bessel Roots ζ_{m,n}

  n \ m       0         1         2         3         4         5         6         7
    1      2.4048    3.8317    5.1356    6.3802    7.5883    8.7715    9.9361   11.0860
    2      5.5201    7.0156    8.4172    9.7610   11.0650
    3      8.6537   10.1730   11.6200
    4     11.7920
Remark: According to (16.51),
\[ J_m(0) = 0 \quad \text{for} \quad m > 0, \qquad \text{while} \qquad J_0(0) = 1. \]
However, we do not count 0 as a bona fide Bessel root, since it does not lead to a valid
eigenfunction for the Helmholtz boundary value problem.
Summarizing our progress, the eigenvalues
\[ \lambda_{m,n} = \zeta_{m,n}^2\,, \qquad n = 1, 2, 3, \ldots, \quad m = 0, 1, 2, \ldots, \qquad (16.54) \]
of the Bessel boundary value problem (16.48), (16.49) are the squares of the roots of the Bessel function of order m. The corresponding eigenfunctions are
\[ w_{m,n}(r) = J_m\bigl( \zeta_{m,n}\, r \bigr), \qquad n = 1, 2, 3, \ldots, \quad m = 0, 1, 2, \ldots, \qquad (16.55) \]
defined for 0 ≤ r ≤ 1. Combining (16.55) with the formula (16.47) for the angular components, we conclude that the separable solutions (16.46) to the polar Helmholtz boundary value problem (16.44), (16.45) are
\[ v_{0,n}(r, \theta) = J_0( \zeta_{0,n}\, r ), \qquad v_{m,n}(r, \theta) = J_m( \zeta_{m,n}\, r )\, \cos m\,\theta, \qquad \widehat v_{m,n}(r, \theta) = J_m( \zeta_{m,n}\, r )\, \sin m\,\theta, \]
where n = 1, 2, 3, …, and m = 1, 2, ….
[Figure 16.2. Fundamental Modes for a Disk: the eigenfunctions v_{m,n}(r, θ) for m = 0, 1, 2, 3 and n = 1, 2, 3.]
These solutions define the natural modes of vibration for a disk, and Figure 16.2 plots the
first few of them. The eigenvalues λ_{0,n} are simple, and contribute radially symmetric eigenfunctions, whereas the eigenvalues λ_{m,n} for m > 0 are double, and produce two linearly independent separable eigenfunctions, with trigonometric dependence on the angular variable. As in the rectangular case, it is possible to prove that the separable eigenfunctions are complete (there are no other eigenfunctions) and, moreover, every (reasonable)
function defined on the unit disk can be written as a generalized Fourier series in the
Bessel eigenfunctions.

We have now produced the basic solutions
\[
u_{0,n}(t,r) = e^{-\zeta_{0,n}^2\, t}\, J_0(\zeta_{0,n}\, r), \qquad
u_{m,n}(t,r,\theta) = e^{-\zeta_{m,n}^2\, t}\, J_m(\zeta_{m,n}\, r)\, \cos m\theta, \qquad
\hat u_{m,n}(t,r,\theta) = e^{-\zeta_{m,n}^2\, t}\, J_m(\zeta_{m,n}\, r)\, \sin m\theta,
\tag{16.56}
\]
where n = 1, 2, 3, \dots and m = 1, 2, \dots, to the Dirichlet boundary value problem for the heat equation on the unit disk. The
general solution is a linear superposition, in the form of an infinite series
\[
u(t,r,\theta) = \sum_{n=1}^{\infty} a_{0,n}\, u_{0,n}(t,r)
 + \sum_{m,n=1}^{\infty} \Bigl[\, a_{m,n}\, u_{m,n}(t,r,\theta) + b_{m,n}\, \hat u_{m,n}(t,r,\theta) \,\Bigr].
\]

As usual, the coefficients am,n , bm,n are determined by the initial condition, so

\[
u(0,r,\theta) = \sum_{n=1}^{\infty} a_{0,n}\, v_{0,n}(r)
 + \sum_{m,n=1}^{\infty} \Bigl[\, a_{m,n}\, v_{m,n}(r,\theta) + b_{m,n}\, \hat v_{m,n}(r,\theta) \,\Bigr] = f(r,\theta).
\]
Thus, we must expand the initial data into a Fourier-Bessel series, which involves Bessel
functions along with the original Fourier trigonometric functions.
According to Section 16.2, the eigenfunctions are orthogonal with respect to the
standard \(L^2\) inner product
\[
\langle\, u\, ;\, v\, \rangle = \iint_D u(x,y)\, v(x,y)\; dx\, dy
 = \int_0^1 \! \int_0^{2\pi} u(r,\theta)\, v(r,\theta)\; r\, d\theta\, dr
\]
on the unit disk. (Note the extra factor of r coming from the polar coordinate form (A.51)
of the area element \(dx\, dy = r\, dr\, d\theta\).) The norms of the Fourier-Bessel functions are given
by the interesting formula
\[
\| v_{m,n} \| = \| \hat v_{m,n} \| = \sqrt{\pi}\; \bigl|\, J_m'(\zeta_{m,n})\, \bigr|
\tag{16.57}
\]
that involves the value of the derivative of the Bessel function at the appropriate Bessel
root. A proof of this formula will be given in Appendix C; see Exercises and . A table
of their numerical values follows; as above, the rows are indexed by n and the columns by
m.

Technically, this follows from general principles except for the two eigenfunctions corresponding to the double eigenvalues, whose orthogonality must be verified by hand.


Norms of the Fourier-Bessel Functions \(\| v_{m,n} \| = \| \hat v_{m,n} \|\)

   n \ m      0        1        2        3        4        5        6        7

     1      0.9202   0.7139   0.6020   0.5287   0.4757   0.4350   0.4026   0.3759
     2      0.6031   0.5319   0.4810   0.4421   0.4110   0.3854   0.3638   0.3453
     3      0.4811   0.4426   0.4120   0.3869   0.3658   0.3477   0.3319   0.3180
     4      0.4120   0.3870   0.3661   0.3482   0.3326   0.3189   0.3067   0.2958

Orthogonality of the eigenfunctions implies that
\[
a_{m,n} = \frac{\langle\, f\, ;\, v_{m,n}\, \rangle}{\| v_{m,n} \|^2}
        = \frac{1}{\pi\, J_m'(\zeta_{m,n})^2} \int_0^1 \! \int_0^{2\pi} f(r,\theta)\, J_m(\zeta_{m,n}\, r)\, r\, \cos m\theta \; d\theta\, dr,
\]
\[
b_{m,n} = \frac{\langle\, f\, ;\, \hat v_{m,n}\, \rangle}{\| \hat v_{m,n} \|^2}
        = \frac{1}{\pi\, J_m'(\zeta_{m,n})^2} \int_0^1 \! \int_0^{2\pi} f(r,\theta)\, J_m(\zeta_{m,n}\, r)\, r\, \sin m\theta \; d\theta\, dr.
\tag{16.58}
\]
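As an illustration of how these coefficients might be evaluated in practice, here is a hedged Python sketch; the initial temperature f(r, θ) below is an arbitrary test function chosen for illustration, not one taken from the text, and the squared norm is computed by direct quadrature over the disk rather than from (16.57).

# Sketch: numerically evaluate a Fourier-Bessel coefficient a_{m,n} of (16.58)
# for a sample initial temperature on the unit disk.
import numpy as np
from scipy.special import jv, jn_zeros
from scipy.integrate import dblquad

def f(r, theta):                         # hypothetical initial data, for illustration only
    return (1 - r**2) * np.cos(theta)

def fourier_bessel_a(m, n):
    zeta = jn_zeros(m, n)[-1]            # nth positive root of J_m
    v = lambda theta, r: jv(m, zeta * r) * np.cos(m * theta)
    # numerator and squared norm, both integrals over the disk (area element r dr dtheta)
    num, _ = dblquad(lambda th, r: f(r, th) * v(th, r) * r, 0, 1, 0, 2 * np.pi)
    nrm2, _ = dblquad(lambda th, r: v(th, r) ** 2 * r, 0, 1, 0, 2 * np.pi)
    return num / nrm2

print(fourier_bessel_a(1, 1))            # the coefficient a_{1,1}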

In accordance with the general theory, each individual solution (16.56) to the heat equation
decays exponentially fast, at a rate prescribed by the square of the corresponding Bessel
root, \(\lambda_{m,n} = \zeta_{m,n}^2\). In particular, the dominant mode, meaning the one that persists the
longest, is
\[
u_{0,1}(t,r,\theta) = e^{-\zeta_{0,1}^2\, t}\, J_0(\zeta_{0,1}\, r).
\tag{16.59}
\]
Its decay rate \(\zeta_{0,1}^2 \approx 5.783\) is the square of the first root of the Bessel function \(J_0(z)\). This
is the rate at which a disk whose boundary is held at zero temperature approaches thermal equilibrium. The dominant eigenfunction \(v_{0,1}(r,\theta) = J_0(\zeta_{0,1}\, r) > 0\) is strictly positive
within the entire disk and radially symmetric. Consequently, for most initial conditions
(specifically, those for which the leading coefficient \(a_{0,1} \ne 0\)), the disk's temperature distribution eventually becomes entirely of one sign and radially symmetric, exponentially decaying to zero at the
rate of slightly less than 6. See Figure 16.3 for a plot of a typical solution, displayed at
successive times t = 0, .04, .08, .12, .16, .2. Note how, in accordance with the theory, the
solution almost immediately acquires a radial symmetry, followed by a fairly rapid decay
to thermal equilibrium.

16.4. The Fundamental Solution.


As we learned in Section 13.1, the fundamental solution to the heat equation measures
the temperature distribution resulting from a concentrated initial heat source, e.g., a hot
soldering iron applied instantaneously at one point of the plate. The physical problem is
modeled mathematically by imposing a delta function as the initial condition for the heat
equation, along with homogeneous boundary conditions of the appropriate type. Once we
know the fundamental solution, we will be in a position to recover the solution for arbitrary
initial data by a linear superposition principle.

Figure 16.3. Heat Diffusion in a Disk.

As in the one-dimensional case, we shall concentrate on the most tractable case, when
the domain is the entire plane: \(\Omega = \mathbb{R}^2\). Our first goal will be to solve the initial value
problem
\[
u_t = \Delta u, \qquad u(0,x,y) = \delta(x - \xi,\, y - \eta) = \delta(x - \xi)\, \delta(y - \eta),
\tag{16.60}
\]
for all t > 0 and all \((x,y) \in \mathbb{R}^2\). The initial data is a delta function representing a concentrated unit heat source placed at position \((\xi, \eta)\). The resulting solution \(u = F(t,x,y;\xi,\eta)\)
is the fundamental solution for the heat equation on all of \(\mathbb{R}^2\).
The easiest route to the desired solution is the following simple lemma that uses solutions of the one-dimensional heat equation to construct solutions of the two-dimensional
version.
Lemma 16.2. If v(t, x) and w(t, x) are any two solutions to the one-dimensional
heat equation \(u_t = u_{xx}\), then the product
\[
u(t,x,y) = v(t,x)\, w(t,y)
\tag{16.61}
\]
is a solution to the two-dimensional heat equation \(u_t = u_{xx} + u_{yy}\).


Proof: Our assumptions imply that \(v_t = v_{xx}\), while \(w_t = w_{yy}\) when we write
w(t, y) as a function of t and y. Therefore, when we differentiate (16.61),
\[
\frac{\partial u}{\partial t}
 = \frac{\partial v}{\partial t}\, w + v\, \frac{\partial w}{\partial t}
 = \frac{\partial^2 v}{\partial x^2}\, w + v\, \frac{\partial^2 w}{\partial y^2}
 = \frac{\partial^2 u}{\partial x^2} + \frac{\partial^2 u}{\partial y^2},
\]
and hence u(t, x, y) solves the heat equation.  Q.E.D.

Thus, for example, if
\[
v(t,x) = e^{-\omega^2 t}\, \cos \omega x, \qquad w(t,y) = e^{-\nu^2 t}\, \cos \nu y,
\]
are separable solutions of the one-dimensional heat equation, then
\[
u(t,x,y) = e^{-(\omega^2 + \nu^2)\, t}\, \cos \omega x\, \cos \nu y
\]
is one of the separable solutions in rectangular coordinates.


A more interesting case is to let
\[
v(t,x) = \frac{1}{2\sqrt{\pi\, t}}\, e^{-(x-\xi)^2/(4t)}, \qquad
w(t,y) = \frac{1}{2\sqrt{\pi\, t}}\, e^{-(y-\eta)^2/(4t)},
\tag{16.62}
\]
both be the fundamental solutions (13.46) to the one-dimensional heat equation at the points
\(x = \xi\) and \(y = \eta\), respectively. Multiplying these two solutions together produces the
fundamental solution for the two-dimensional problem.
Proposition 16.3. The fundamental solution to the heat equation \(u_t = \Delta u\) corresponding to a unit delta function placed at position \((\xi,\eta) \in \mathbb{R}^2\) at the initial time \(t_0 = 0\)
is
\[
F(t,\, x - \xi,\, y - \eta) = \frac{1}{4\pi\, t}\, e^{-[\,(x-\xi)^2 + (y-\eta)^2\,]/(4t)}.
\tag{16.63}
\]
Proof: Since we already know that (16.62) are solutions to the one-dimensional heat
equation, Lemma 16.2 guarantees that \(u(t,x,y) = v(t,x)\, w(t,y)\), as given by (16.63), solves
the planar equation for t > 0. Moreover, at the initial time
\[
u(0,x,y) = v(0,x)\, w(0,y) = \delta(x - \xi)\, \delta(y - \eta)
\]
is a product of delta functions, and hence the result follows. We note that the total heat
\[
\iint u(t,x,y)\; dx\, dy
 = \left( \int_{-\infty}^{\infty} v(t,x)\, dx \right) \left( \int_{-\infty}^{\infty} w(t,y)\, dy \right) = 1,
\qquad t \ge 0,
\]
remains constant, while
\[
\lim_{t \to 0^+} u(t,x,y) =
\begin{cases}
\infty, & (x,y) = (\xi,\eta), \\
0, & \text{otherwise},
\end{cases}
\]
has the standard delta function limit.  Q.E.D.
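As a quick sanity check on (16.63), one can evaluate the formula on a grid and confirm numerically that the total heat equals 1; the placement (ξ, η), the time t, and the grid used in this sketch are arbitrary choices.

# Sketch: evaluate the fundamental solution (16.63) and check heat conservation.
import numpy as np

def F(t, x, y, xi=0.3, eta=-0.2):
    return np.exp(-((x - xi)**2 + (y - eta)**2) / (4 * t)) / (4 * np.pi * t)

t = 0.05
xs = np.linspace(-5, 5, 801)
X, Y = np.meshgrid(xs, xs)
dx = xs[1] - xs[0]
total_heat = F(t, X, Y).sum() * dx * dx
print(total_heat)                        # should be very close to 1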

As illustrated in Figure 16.4 at times t = .01, .02, .05, .1, the initially concentrated
heat spreads out in a radially symmetric manner. The total amount of heat
\[
\iint u(t,x,y)\; dx\, dy = 1, \qquad t \ge 0,
\]
remains constant, but at each individual point (x, y), after a slight initial rise, the temperature decays back to zero at a rate proportional to 1/t.
Both the planar fundamental solution and its one-dimensional counterpart have a bell-shaped
Gaussian exponential profile. The one difference is the initial factor. In a one-dimensional
medium, the fundamental solution decays in proportion to \(1/\sqrt{t}\), whereas in two dimensions
the decay is more rapid, being proportional to 1/t. The physical explanation is that the
energy is able to spread out in two independent directions, and hence diffuse away from its
initial source faster. As we shall see, the decay in three-dimensional space is more rapid
still, being proportional to \(t^{-3/2}\), for similar reasons; see (17.96).

Figure 16.4. Fundamental Solution to the Planar Heat Equation.
The principal purpose of the fundamental solution is to solve the general initial value
problem. We express the initial temperature distribution as a superposition of delta function sources,
\[
u(0,x,y) = f(x,y) = \iint f(\xi,\eta)\, \delta(x - \xi,\, y - \eta)\; d\xi\, d\eta,
\]
where, at the point \((\xi,\eta) \in \mathbb{R}^2\), the source has magnitude \(f(\xi,\eta)\). Linearity implies that the
solution is then given by the same superposition of the associated fundamental solutions.
Let us state this result as a theorem.
Theorem 16.4. The solution to the initial value problem
\[
u_t = \Delta u, \qquad u(0,x,y) = f(x,y), \qquad (x,y) \in \mathbb{R}^2,
\]
is given by the linear superposition formula
\[
u(t,x,y) = \frac{1}{4\pi\, t} \iint f(\xi,\eta)\, e^{-[\,(x-\xi)^2 + (y-\eta)^2\,]/(4t)}\; d\xi\, d\eta.
\tag{16.64}
\]

We can interpret the solution formula (16.64) as a two-dimensional convolution
\[
u(t,x,y) = F(t,x,y) * f(x,y)
\tag{16.65}
\]
of the initial data with a one-parameter family of progressively wider and shorter Gaussian
filters; compare (12.111). As in the one-dimensional version, convolution with such an
integral kernel can be interpreted as a form of weighted averaging of the function, which
has the effect of smoothing out and blurring the initial signal f(x, y).
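The smoothing interpretation can be made concrete on a grid: for the normalization in (16.63), convolution with the fundamental solution is a Gaussian blur of standard deviation \(\sqrt{2t}\). The following sketch uses SciPy's gaussian_filter for this purpose; the grid size, domain, and initial data are arbitrary illustrative choices.

# Sketch: solve the planar heat equation on a grid by Gaussian blurring, since
# convolution with (16.63) is a Gaussian filter of standard deviation sqrt(2 t).
import numpy as np
from scipy.ndimage import gaussian_filter

n, L = 256, 4.0                          # grid resolution and physical side length
dx = L / n
x = np.linspace(-L / 2, L / 2, n)
X, Y = np.meshgrid(x, x)
f = (X**2 + Y**2 < 1.0).astype(float)    # sample initial temperature: 1 on the unit disk

t = 0.1
u_t = gaussian_filter(f, sigma=np.sqrt(2 * t) / dx, mode='constant')
print(u_t[n // 2, n // 2])               # temperature at the center at time t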

Example 16.5. If our initial temperature distribution is constant on a circular
region, say
\[
u(0,x,y) =
\begin{cases}
1, & x^2 + y^2 < 1, \\
0, & \text{otherwise},
\end{cases}
\]
then the solution can be evaluated using (16.64), as follows:
\[
u(t,x,y) = \frac{1}{4\pi\, t} \iint_D e^{-[\,(x-\xi)^2 + (y-\eta)^2\,]/(4t)}\; d\xi\, d\eta,
\]
where the integral is over the unit disk \(D = \{\, \xi^2 + \eta^2 \le 1 \,\}\). Let us evaluate the integral
by going to polar coordinates \(\xi = \rho \cos\varphi,\ \eta = \rho \sin\varphi\):
\[
u(t,x,y) = \frac{1}{4\pi\, t} \int_0^{2\pi} \! \int_0^1
 e^{-[\,(x - \rho\cos\varphi)^2 + (y - \rho\sin\varphi)^2\,]/(4t)}\; \rho\, d\rho\, d\varphi.
\]
Unfortunately, the final integral cannot be done in closed form in terms of elementary
functions; see Exercise for an expression in terms of complex Bessel functions. On the
other hand, numerical evaluation of the integral is straightforward. A plot of the resulting
radially symmetric solution appears in Figure h2disk .
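One possible way to carry out the numerical evaluation just mentioned is direct quadrature of the polar-coordinate integral above; the evaluation point and time in this sketch are arbitrary.

# Sketch: numerically evaluate the solution of Example 16.5 at one point and time.
import numpy as np
from scipy.integrate import dblquad

def u(t, x, y):
    integrand = lambda rho, phi: (
        np.exp(-((x - rho * np.cos(phi))**2 + (y - rho * np.sin(phi))**2) / (4 * t))
        * rho / (4 * np.pi * t)
    )
    val, _ = dblquad(integrand, 0, 2 * np.pi, 0, 1)   # phi outer, rho inner
    return val

print(u(0.1, 0.0, 0.0))    # temperature at the center of the disk at t = 0.1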
For more general configurations, where analytical solutions are no longer available,
numerical solutions can be implemented based on a two-dimensional variant of the Crank-Nicolson scheme (13.150), relying on either finite differences or finite elements to discretize
the space coordinates. We will not dwell on the details, but refer the interested reader to
[23, 101, nPDE].

16.5. The Planar Wave Equation.


The second important class of dynamical equations are those governing vibrational
motions. The simplest planar system of this type is the two-dimensional wave equation
\[
\frac{\partial^2 u}{\partial t^2} = c^2\, \Delta u
 = c^2 \left( \frac{\partial^2 u}{\partial x^2} + \frac{\partial^2 u}{\partial y^2} \right),
\tag{16.66}
\]
which models the free (unforced) vibrations of a uniform two-dimensional membrane (a
drum, say). Here u(t, x, y) represents the displacement of the membrane at time t and position \((x,y) \in \Omega\), where \(\Omega \subset \mathbb{R}^2\) is the domain representing the shape of the membrane. The
constant \(c^2 > 0\) encapsulates the physical properties of our membrane (density, tension,
stiffness, thickness, etc.), with c being called, as in the one-dimensional version, the wave
speed, since it turns out to be the speed at which localized signals propagate through
the membrane. In this model, we are only allowing small, transverse (meaning vertical)
displacements of the membrane. Large elastic vibrations lead to the nonlinear partial
differential equations of elastodynamics, [56]. The bending vibrations of a flexible plate,
which can be viewed as the two-dimensional version of a beam, are governed by a more
complicated fourth order partial differential equation; see Exercise .

The solution u(t, x, y) to the wave equation will be uniquely specified once we impose
suitable boundary conditions and initial conditions. The Dirichlet conditions
\[
u = h \quad \text{on} \quad \partial\Omega
\tag{16.67}
\]
correspond to gluing our membrane to a fixed boundary or rim. On the other hand, the
homogeneous Neumann conditions
\[
\frac{\partial u}{\partial n} = 0 \quad \text{on} \quad \partial\Omega
\tag{16.68}
\]
represent a free boundary where the membrane is not attached to any support. Mixed
boundary conditions attach part of the boundary and leave the remaining portion free to
vibrate:
\[
\frac{\partial u}{\partial n} = 0 \quad \text{on} \quad N, \qquad
u = h \quad \text{on} \quad D,
\tag{16.69}
\]
where \(\partial\Omega = D \cup N\) with D and N non-overlapping. Since the wave equation is second
order in time, we also need to impose two initial conditions:
\[
u(0,x,y) = f(x,y), \qquad \frac{\partial u}{\partial t}(0,x,y) = g(x,y), \qquad (x,y) \in \Omega.
\tag{16.70}
\]
The first one prescribes the initial displacement of the membrane, while the second prescribes its initial velocity.
The wave equation is the simplest example of a general second order system of Newtonian form
\[
\frac{\partial^2 u}{\partial t^2} = -\,K[\,u\,] = -\,\nabla^{*} \circ \nabla\, u.
\tag{16.71}
\]
As in (16.15), using general weighted inner products
\[
\langle\, u\, ;\, \tilde u\, \rangle = \iint_\Omega u(x,y)\, \tilde u(x,y)\, \rho(x,y)\; dx\, dy, \qquad
\langle\langle\, v\, ;\, \tilde v\, \rangle\rangle = \iint_\Omega v(x,y) \cdot \tilde v(x,y)\, \kappa(x,y)\; dx\, dy,
\tag{16.72}
\]
the adjoint to the gradient is a rescaled version of the divergence operator
\[
\nabla^{*} v = -\,\frac{1}{\rho}\, \nabla \cdot (\kappa\, v).
\]
Therefore, the general Newtonian system (16.71) takes the form
\[
u_{tt} = -\,K[\,u\,] = \frac{1}{\rho}\, \nabla \cdot (\kappa\, \nabla u),
\]
or, in full detail,
\[
\frac{\partial^2 u}{\partial t^2}
 = \frac{1}{\rho(x,y)} \left[
 \frac{\partial}{\partial x} \left( \kappa(x,y)\, \frac{\partial u}{\partial x} \right)
 + \frac{\partial}{\partial y} \left( \kappa(x,y)\, \frac{\partial u}{\partial y} \right)
 \right].
\tag{16.73}
\]
This equation models the small transverse vibrations of a nonuniform membrane, in which
\(\rho(x,y) > 0\) represents the density of the membrane at the point \((x,y) \in \Omega\), while \(\kappa(x,y) > 0\)
represents its stiffness, in direct analogy with the one-dimensional version (13.79). In
particular, if the material is homogeneous, then both \(\rho\) and \(\kappa\) are constant, and (16.73)
reduces to the two-dimensional wave equation (16.66) with wave speed
\[
c = \sqrt{\frac{\kappa}{\rho}}\,.
\tag{16.74}
\]
As in bars and strings, either increasing the stiffness, or decreasing the density, will cause
the wave speed c to increase, and hence waves (signals) will propagate faster through the
membrane.
Separation of Variables
Unfortunately, there is no explicit analytical technique comparable to the d'Alembert
formula (13.117) for solving multi-dimensional wave equations. As a result, we are forced
to fall back on our universal solution tool: separation of variables. Initially, the technique
applies equally well to general vibration equations (16.71), and so we shall work in this
context for the time being. The reader can, if desired, immediately specialize to the
wave equation (16.66) itself, as explicit formulae will only be found in this case. We
assume throughout that the boundary conditions (Dirichlet, Neumann, or mixed)
are homogeneous; see Exercise for an outline of how to handle inhomogeneous boundary
conditions.
As in the one-dimensional analysis from Section 13.3, the separable solutions to the
vibration equation (16.71) are found by using a trigonometric ansatz
\[
u(t,x,y) = \cos(\omega t)\, v(x,y).
\tag{16.75}
\]
By linearity of K, which does not involve any t differentiation,
\[
u_{tt} = -\,\omega^2 \cos(\omega t)\, v(x,y), \qquad K[\,u\,] = \cos(\omega t)\, K[\,v\,].
\]
Substituting into (16.71), and canceling out the cosine terms, we find that v(x, y) must
satisfy the by now familiar eigenvalue problem
\[
K[\,v\,] = \omega^2\, v = \lambda\, v,
\tag{16.76}
\]

in which v is the eigenfunction whose eigenvalue \(\lambda = \omega^2\) is equal to the square of the
vibrational frequency \(\omega\). The eigenfunction v(x, y) is always required to satisfy the relevant
boundary conditions. Specializing to the wave equation (16.66), the eigenvalue problem
(16.76) reduces to the same Helmholtz equation
\[
c^2\, \Delta v + \lambda\, v = c^2 (v_{xx} + v_{yy}) + \lambda\, v = 0
\tag{16.77}
\]
that we analyzed earlier in this chapter.


As we learned, in the stable, positive definite cases (meaning either Dirichlet or
mixed boundary conditions) the operator K admits an infinite sequence of positive
eigenvalues
\[
0 < \lambda_1 \le \lambda_2 \le \lambda_3 \le \cdots, \qquad \text{with} \qquad
\lambda_k \longrightarrow \infty \quad \text{as} \quad k \to \infty.
\]

Each eigenvalue and eigenfunction pair will produce two vibrating solutions
\[
u_k(t,x,y) = \cos(\omega_k t)\, v_k(x,y), \qquad
\tilde u_k(t,x,y) = \sin(\omega_k t)\, v_k(x,y),
\tag{16.78}
\]
of frequency \(\omega_k = \sqrt{\lambda_k}\). Note that the higher order modes vibrate faster, with progressively
higher frequencies: \(\omega_k \to \infty\) as \(k \to \infty\).
The general solution to the initial value problem can be built up as a quasi-periodic
linear combination
\[
u(t,x,y) = \sum_{k=1}^{\infty} \Bigl[\, a_k\, u_k(t,x,y) + b_k\, \tilde u_k(t,x,y) \,\Bigr]
 = \sum_{k=1}^{\infty} \Bigl[\, a_k \cos(\omega_k t) + b_k \sin(\omega_k t) \,\Bigr]\, v_k(x,y)
\tag{16.79}
\]
of the fundamental vibrational modes, in the form of an eigenfunction series. The eigenfunction coefficients \(a_k, b_k\) are prescribed by the initial conditions. Thus, evaluating the
solution series (16.79) and its time derivative at the initial time t = 0, we find
\[
\sum_{k=1}^{\infty} a_k\, v_k(x,y) = f(x,y), \qquad
\sum_{k=1}^{\infty} \omega_k\, b_k\, v_k(x,y) = g(x,y).
\tag{16.80}
\]

We then appeal to the orthogonality of the eigenfunctions to compute the coefficients
\[
a_k = \frac{\langle\, f\, ;\, v_k\, \rangle}{\| v_k \|^2}
    = \frac{\displaystyle \iint f\, v_k\, \rho \; dx\, dy}{\displaystyle \iint v_k^2\, \rho \; dx\, dy},
\qquad
b_k = \frac{1}{\omega_k}\, \frac{\langle\, g\, ;\, v_k\, \rangle}{\| v_k \|^2}
    = \frac{\displaystyle \iint g\, v_k\, \rho \; dx\, dy}{\displaystyle \omega_k \iint v_k^2\, \rho \; dx\, dy},
\tag{16.81}
\]
via our usual formulae (16.24). In the case of the wave equation, the density \(\rho\) is constant,
and hence can be canceled from the numerator and denominator of the orthogonality
formulae (16.81). As long as the initial data is reasonably well-behaved, Theorem 16.1 will
justify the convergence of the resulting series solution.
The unstable, semi-definite case of pure Neumann boundary conditions models a
physical membrane that has not been attached anywhere along its boundary, and so is
free to move off in a vertical direction. Here, the constant solution \(v_0(x,y) \equiv 1\) is a
null eigenfunction, corresponding to the zero eigenvalue \(\lambda_0 = 0\). In general, each null
eigenfunction provides two solutions to the vibration equation (16.71), which in the present
situation are the two elementary solutions
\[
u_0(t,x,y) = 1, \qquad \tilde u_0(t,x,y) = t.
\]
The first solution represents a membrane that has been displaced by a fixed amount in the
vertical direction, while the second represents a membrane that is uniformly moving in the
vertical direction with speed 1. (Think of the membrane moving in outer space unaffected

We are assuming that the series converges sufficiently rapidly in order to be allowed to
differentiate term by term.


by any external gravitational force.) The general solution to the vibration equation then
has the series form
\[
u(t,x,y) = a_0 + b_0\, t
 + \sum_{k=1}^{\infty} \Bigl[\, a_k \cos(\omega_k t) + b_k \sin(\omega_k t) \,\Bigr]\, v_k(x,y).
\tag{16.82}
\]

The coefficients \(a_k, b_k\) for k > 0 are given by the same orthogonality formulae (16.81). The
only unstable, nonperiodic mode is the linearly growing component \(b_0\, t\) in (16.82).
Its coefficient
\[
b_0 = \frac{\langle\, g\, ;\, 1\, \rangle}{\| 1 \|^2}
 = \frac{\displaystyle \iint_\Omega g\, \rho \; dx\, dy}{\displaystyle \iint_\Omega \rho \; dx\, dy}
\]
is a weighted average of the initial velocity \(g(x,y) = u_t(0,x,y)\) over the domain. In the
case of the wave equation, the density \(\rho\) is constant, and hence
\[
b_0 = \frac{1}{\operatorname{area}\,\Omega} \iint_\Omega g(x,y)\; dx\, dy
\]

equals the average initial velocity. If the (weighted) average initial velocity \(b_0 \ne 0\) is
nonzero, then the membrane will move off at an average vertical speed \(b_0\) while continuing to vibrate in any of the vibrational modes that have been excited by the initial
displacement and/or initial velocity. Again, this is merely a two-dimensional translation
of our observations of a free, vibrating bar, which in turn was the continuum version of
an unsupported structure.

16.6. Analytical Solutions of the Wave Equation.


So far, we have looked at some of the general, qualitative features of the two-dimensional vibration and wave equations. Actual analytical solutions are, of course, harder to
come by, and can only be found in very special geometrical configurations. In this section,
we discuss the two most important special cases: rectangular and circular membranes.
Remark : Most realistic vibration problems need to be solved numerically, by adaptations of the integration schemes outlined in Section 13.5. The spatial discretization is
implemented using either finite differences, or a version of finite elements. We refer the
reader to [nPDE] for details.
Vibration of a Rectangular Drum
Let us first consider the vibrations of a membrane in the shape of a rectangle
\[
R = \bigl\{\, 0 < x < a,\; 0 < y < b \,\bigr\}
\]
with side lengths a and b, whose sides are fixed to the (x, y)-plane. Thus, we seek to solve
the wave equation
\[
u_{tt} = c^2\, \Delta u = c^2 (u_{xx} + u_{yy}), \qquad 0 < x < a, \quad 0 < y < b,
\tag{16.83}
\]

subject to the initial and boundary conditions
\[
u(t,0,y) = u(t,a,y) = 0 = u(t,x,0) = u(t,x,b), \qquad 0 < x < a, \quad 0 < y < b,
\]
\[
u(0,x,y) = f(x,y), \qquad u_t(0,x,y) = g(x,y).
\tag{16.84}
\]

As we saw in Section 16.3, the eigenfunctions of the Helmholtz equation
\[
c^2 (v_{xx} + v_{yy}) + \lambda\, v = 0, \qquad (x,y) \in R,
\tag{16.85}
\]
on a rectangle, subject to the homogeneous Dirichlet boundary conditions
\[
v(0,y) = v(a,y) = 0 = v(x,0) = v(x,b), \qquad 0 < x < a, \quad 0 < y < b,
\tag{16.86}
\]
are
\[
v_{m,n}(x,y) = \sin \frac{m\pi x}{a}\, \sin \frac{n\pi y}{b}, \qquad \text{where} \qquad
\lambda_{m,n} = c^2 \pi^2 \left( \frac{m^2}{a^2} + \frac{n^2}{b^2} \right),
\tag{16.87}
\]
with m, n = 1, 2, \dots. The fundamental frequencies of vibration are the square roots of the
eigenvalues, so
\[
\omega_{m,n} = \sqrt{\lambda_{m,n}} = c\, \pi \sqrt{ \frac{m^2}{a^2} + \frac{n^2}{b^2} }\,.
\tag{16.88}
\]
The frequencies will depend upon the underlying geometry, meaning the side lengths
of the rectangle, as well as the wave speed, which in turn is a function of the density
and stiffness of the membrane, (16.74). The higher the wave speed c, or the smaller the
rectangle, the faster the vibrations. In layman's terms, (16.88) quantifies the observation
that smaller, stiffer drums of less dense material vibrate faster.
According to (16.78), the normal modes of vibration of our rectangle are
\[
u_{m,n}(t,x,y) = \cos(\omega_{m,n} t)\, \sin \frac{m\pi x}{a}\, \sin \frac{n\pi y}{b}, \qquad
\tilde u_{m,n}(t,x,y) = \sin(\omega_{m,n} t)\, \sin \frac{m\pi x}{a}\, \sin \frac{n\pi y}{b}.
\tag{16.89}
\]

The general solution can be written as a double Fourier series
\[
u(t,x,y) = \sum_{m,n=1}^{\infty} \Bigl[\, a_{m,n}\, u_{m,n}(t,x,y) + b_{m,n}\, \tilde u_{m,n}(t,x,y) \,\Bigr]
\]
in the normal modes. The coefficients \(a_{m,n}, b_{m,n}\) are fixed by the initial displacement
\(u(0,x,y) = f(x,y)\) and the initial velocity \(u_t(0,x,y) = g(x,y)\), as in (16.80). The orthogonality relations among the eigenfunctions imply
\[
a_{m,n} = \frac{\langle\, v_{m,n}\, ;\, f\, \rangle}{\| v_{m,n} \|^2}
        = \frac{4}{a\, b} \int_0^b \! \int_0^a f(x,y)\, \sin \frac{m\pi x}{a}\, \sin \frac{n\pi y}{b}\; dx\, dy,
\]
\[
b_{m,n} = \frac{\langle\, v_{m,n}\, ;\, g\, \rangle}{\omega_{m,n}\, \| v_{m,n} \|^2}
        = \frac{4}{c\, \pi \sqrt{m^2 b^2 + n^2 a^2}} \int_0^b \! \int_0^a g(x,y)\, \sin \frac{m\pi x}{a}\, \sin \frac{n\pi y}{b}\; dx\, dy.
\]
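To make these formulas concrete, the following sketch computes a frequency (16.88) and a displacement coefficient by numerical quadrature; the side lengths, wave speed, and the concentrated Gaussian initial bump (the same profile used for Figure 16.5 below) are illustrative choices.

# Sketch: frequency (16.88) and coefficient a_{m,n} for a rectangular membrane.
import numpy as np
from scipy.integrate import dblquad

a, b, c = 1.0, 1.0, 1.0                               # side lengths and wave speed

def omega(m, n):
    return c * np.pi * np.sqrt((m / a)**2 + (n / b)**2)

def f(x, y):                                          # sample initial displacement
    return np.exp(-100 * ((x - 0.5)**2 + (y - 0.5)**2))

def a_mn(m, n):
    integrand = lambda y, x: f(x, y) * np.sin(m * np.pi * x / a) * np.sin(n * np.pi * y / b)
    val, _ = dblquad(integrand, 0, a, 0, b)
    return 4 / (a * b) * val

print(omega(1, 1), a_mn(1, 1))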

Figure 16.5. Vibrations of a Square.

Since the fundamental frequencies are not rational multiples of each other, the general
solution is a genuinely quasi-periodic superposition of the various normal modes.
In Figure 16.5 we plot the solution for the initial concentrated displacement
\[
u(0,x,y) = f(x,y) = e^{-100\,[\,(x - .5)^2 + (y - .5)^2\,]}
\]
starting at the center of a unit square, so a = b = 1. The plots are at successive times
0, .02, .04, \dots, 1.6. Note that, unlike a one-dimensional string where a concentrated displacement remains concentrated at all subsequent times and periodically repeats, the initial displacement spreads out in a radially symmetric manner and propagates to the edges
of the rectangle, where it reflects and then interacts with itself. However, owing to the
quasiperiodicity of the solution, the displacement of the drum never exactly repeats itself,
and the initial concentrated signal never quite reforms in the center.
Vibration of a Circular Drum
Let us next analyze the vibrations of a circular membrane with fixed Dirichlet boundary conditions. As always, we build up the solution as a quasi-periodic linear combination
of the normal modes, which, by (16.78), are fixed by the eigenfunctions for the associated
Helmholtz boundary value problem.
As we saw in Section 16.3, the eigenfunctions of the Helmholtz equation on a disk
of radius 1, say, subject to homogeneous Dirichlet boundary conditions, are products of
trigonometric and Bessel functions:
\[
v_{m,n}(r,\theta) = \cos(m\theta)\, J_m(\zeta_{m,n}\, r), \qquad
\tilde v_{m,n}(r,\theta) = \sin(m\theta)\, J_m(\zeta_{m,n}\, r), \qquad
m = 0, 1, 2, \dots, \quad n = 1, 2, 3, \dots.
\tag{16.90}
\]

Here \(r, \theta\) are the usual polar coordinates, while \(\zeta_{m,n}\) denotes the nth root of the mth order
Bessel function \(J_m(z)\), cf. (16.53). The corresponding eigenvalue is its square, \(\lambda_{m,n} = \zeta_{m,n}^2\),
and hence the natural frequencies of vibration are the products of the Bessel roots and
the wave speed:
\[
\omega_{m,n} = c\, \sqrt{\lambda_{m,n}} = c\, \zeta_{m,n}.
\tag{16.91}
\]
A table of their values (for the case c = 1) can be found in the preceding section. The
Bessel roots do not follow any easily discernible order or pattern, and are certainly not
rational multiples of each other. Thus, the vibrations of a circular drum are also truly
quasi-periodic.
The frequencies \(\omega_{0,n} = c\, \zeta_{0,n}\) correspond to simple eigenvalues, with a single radially
symmetric eigenfunction \(J_0(\zeta_{0,n}\, r)\), while the angular modes \(\omega_{m,n}\) with m > 0 are double,
each possessing two linearly independent eigenfunctions (16.90). According to the general
formula (16.78), each eigenfunction leads to two independent normal modes of vibration,
having the explicit form
\[
u_{m,n}(t,r,\theta) =
\begin{Bmatrix} \cos \\ \sin \end{Bmatrix} (c\, \zeta_{m,n}\, t)\;
\begin{Bmatrix} \cos \\ \sin \end{Bmatrix} (m\theta)\;
J_m(\zeta_{m,n}\, r).
\tag{16.92}
\]
One can use either the cosine or the sine in each slot, and so the formula gives a total of
four distinct normal modes associated with each Bessel function, unless m = 0, in which
case the solutions are radially symmetric, and there are only two normal modes for each
eigenvalue. The general solution is written as a series in these normal modes, in the form
of a Fourier-Bessel series
\[
u(t,r,\theta) = \sum_{m,n} \Bigl[
 \bigl( a_{m,n} \cos(c\,\zeta_{m,n} t) + b_{m,n} \sin(c\,\zeta_{m,n} t) \bigr) \cos m\theta
 + \bigl( c_{m,n} \cos(c\,\zeta_{m,n} t) + d_{m,n} \sin(c\,\zeta_{m,n} t) \bigr) \sin m\theta
 \Bigr]\, J_m(\zeta_{m,n}\, r).
\tag{16.93}
\]
The coefficients \(a_{m,n}, b_{m,n}, c_{m,n}, d_{m,n}\) are determined, as usual, by the initial displacement
and velocity of the membrane. In Figure vdisk , the vibrations due to an initially concentrated displacement are displayed. Again, the motion is only quasi-periodic and never
quite returns to the original configuration.
Remark: As we learned in Section 13.3, the natural frequencies of vibration of a one-dimensional medium, e.g., a violin string or a column of air in a flute, are integer multiples
of each other. As a consequence, the vibrations are periodic. Musically, this means that
the overtones are integer multiples of the fundamental tone, and, as a result, the music
sounds harmonic to our ear. On the other hand, the natural frequencies of circular and
rectangular drums are irrationally related, and the vibrations are only quasi-periodic. As a
result, we hear a percussive sound! Thus, for some reason, our musical appreciation is psychologically attuned to the differences between rationally related/periodic and irrationally
related/quasi-periodic vibrations.
Scaling and Symmetry
Both translation and scaling symmetries can be effectively employed in the analysis
of the wave equation. Let us consider the simultaneous rescaling
\[
t \longmapsto \alpha\, t, \qquad x \longmapsto \beta\, x, \qquad y \longmapsto \beta\, y,
\tag{16.94}
\]
of time and space, whose effect is to change the function u(t, x, y) into a rescaled function
\[
U(t,x,y) = u(\alpha\, t,\, \beta\, x,\, \beta\, y).
\tag{16.95}
\]

The chain rule relates their derivatives:
\[
\frac{\partial^2 U}{\partial t^2} = \alpha^2\, \frac{\partial^2 u}{\partial t^2}, \qquad
\frac{\partial^2 U}{\partial x^2} = \beta^2\, \frac{\partial^2 u}{\partial x^2}, \qquad
\frac{\partial^2 U}{\partial y^2} = \beta^2\, \frac{\partial^2 u}{\partial y^2}.
\]

Therefore, if u satisfies the wave equation
\[
u_{tt} = c^2\, \Delta u,
\]
then U satisfies the rescaled wave equation
\[
U_{tt} = \frac{\alpha^2 c^2}{\beta^2}\, \Delta U = \tilde c^{\,2}\, \Delta U,
\qquad \text{where the rescaled wave speed is} \qquad
\tilde c = \frac{\alpha\, c}{\beta}\,.
\tag{16.96}
\]
In particular, rescaling time by setting \(\alpha = 1/c\) results in a unit wave speed \(\tilde c = 1\). In other
words, for a given homogeneous medium, we can choose our time unit of measurement to
arrange that the wave speed is equal to 1.
If we set \(\alpha = \beta\), scaling space and time in the same proportion, then the wave speed
does not change, \(\tilde c = c\), and so
\[
t \longmapsto \beta\, t, \qquad x \longmapsto \beta\, x, \qquad y \longmapsto \beta\, y,
\tag{16.97}
\]
defines a symmetry transformation for the wave equation. If u(t, x, y) is any solution to
the wave equation, then so is its rescaled version
\[
U(t,x,y) = u(\beta\, t,\, \beta\, x,\, \beta\, y)
\tag{16.98}
\]
for any choice of (nonzero) scale parameter \(\beta\). In other words, if u(t, x, y) is defined on a
domain \(\Omega\), then the rescaled solution U(t, x, y) will be defined on the rescaled domain
\[
\tilde\Omega = \beta^{-1}\, \Omega = \bigl\{\, (x,y) \;\big|\; (\beta\, x,\, \beta\, y) \in \Omega \,\bigr\}.
\tag{16.99}
\]


For example, if \(\beta = \frac{1}{2}\), then the effect is to double the size of the domain. The fundamental
modes for the rescaled domain have the form
\[
U_n(t,x,y) = u_n(\beta t, \beta x, \beta y) = \cos(\beta\, \omega_n t)\, v_n(\beta x, \beta y), \qquad
\tilde U_n(t,x,y) = \tilde u_n(\beta t, \beta x, \beta y) = \sin(\beta\, \omega_n t)\, v_n(\beta x, \beta y),
\]
and hence the fundamental vibrational frequencies \(\tilde\omega_n = \beta\, \omega_n\) are scaled by the same
overall factor. Thus, when \(\beta < 1\), the rescaled membrane is larger and its vibrations are
slowed down by the same factor. For instance, a drum that is twice as large will vibrate
twice as slowly, and hence have an octave lower overall tone. Musically, this means that all
drums of a similar shape have the same pattern of overtones, differing only in their overall
pitch, which is a function of their size, tautness and density.
pitch, which is a function of their size, tautness and density.
For example, choosing = 1/R will rescale the unit disk into a disk of radius R. The
fundamental frequencies of the rescaled disk are

em,n = m,n =

c
,
R m,n

(16.100)

where c is the wave speed and m,n are the Bessel roots, defined in (16.53). Consequently,
the ratios m,n /m0 ,n0 between vibrational frequencies are the same, independent of the
size of the disk R and the wave speed c. We define the relative vibrational frequencies
m,n =

m,n
m,n
=
,
0,1
0,1

where

0,1 =

c 0,1
c
2.4
R
R

(16.101)

is the dominant, lowest frequency. The relative frequencies m,n are indepedent of the
size, stiffness or composition of the drum membrane. In the following table, we display all
relative vibrational frequencies (16.101) that are < 6. As usual the columns are indexed
by m and the rows by n. Once the lowest frequency 0,1 has been determined either
theoretically, numerically or experimentally all the higher overtones m,n = m,n 0,1
are obtained by multiplication by these fixed relative frequencies of vibration.
Relative Vibrational Frequencies of a Circular Disk

   n \ m     0       1       2       3       4       5       6       7       8       9

     1     1.000   1.593   2.136   2.653   3.155   3.647   4.132   4.610   5.084   5.553
     2     2.295   2.917   3.500   4.059   4.601   5.131   5.651
     3     3.598   4.230   4.832   5.412   5.977
     4     4.903   5.540
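The relative frequencies in this table can be regenerated directly from the Bessel roots; the following short Python sketch (assuming SciPy is available) does so.

# Sketch: recompute the relative frequencies zeta_{m,n} / zeta_{0,1} < 6 listed above.
from scipy.special import jn_zeros

zeta01 = jn_zeros(0, 1)[0]                 # first root of J_0, about 2.4048
for m in range(10):                        # orders m = 0, ..., 9
    ratios = [z / zeta01 for z in jn_zeros(m, 4) if z / zeta01 < 6]
    print(m, ["%.3f" % r for r in ratios])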

16.7. Nodal Curves.


When a membrane vibrates, the individual points move up and down in a quasiperiodic manner. As such, correlations between the motions at different points are not
immediately evident. However, if the system is set to vibrate in a pure eigenmode, say
un (t, x, y) = cos(n t) vn (x, y),

p
then all points on the membrane move up and down at a common frequency n = n ,
which is the square root of the eigenvalue corresponding to the eigenfunction v n (x, y). The
exceptions are the points where the eigenfunction vanishes:
\[
v_n(x,y) = 0.
\tag{16.102}
\]
Such points will not move at all. The set of all points \((x,y) \in \Omega\) that satisfy (16.102)
is known as the nth nodal set of the domain \(\Omega\). If we scatter small particles (e.g., sand
or powder) over the membrane while it is performing a pure vibration, we can actually
see the nodal set, because the particles will, through random movement over the oscillating
regions of the membrane, tend to accumulate along the unmoving nodal curves.
It can be shown that, in general, the nodal set consists of a system of intersecting
curves, known as the nodal curves of the membrane. The nodal curves partition the membrane into nodal regions, and intersect at critical points, \(\nabla v_n = 0\), of the eigenfunction.
Points lying in a common nodal region all vibrate in tandem, so that the entire nodal region
is either up or down, except, momentarily, when the entire membrane has zero displacement. The latter situation occurs at regular time intervals, namely whenever \(\cos \omega_n t = 0\).
Adjacent nodal regions, lying on the opposite sides of a nodal curve, always vibrate in
opposite directions: when one side is up, the other is down, and then, as the membrane
becomes momentarily flat, they simultaneously switch direction.
Example 16.6. Circular Drums. Since the eigenfunctions (16.90) for a disk are
products of trigonometric functions in the angular variable and Bessel functions of the
radius, the nodal curves for the normal modes of vibration of a circular membrane are
rays emanating from, and circles centered at, the origin. Thus, the nodal regions of vibration
are annular sectors. Pictures of the nodal curves for the first nine fundamental modes,
indexed by their relative frequencies, are plotted in Figure 16.6. Figure 16.2 shows a sample
displacement of the membrane in each of the first twelve modes. The dominant (lowest
frequency) mode is the only one that has no nodal curves; it has the form of a radially
symmetric bump where the entire membrane flexes up and down. Every other mode has
at least one nodal curve. For instance, the next lowest modes have frequency \(\omega_{1,1}\), and are
linear combinations \(\alpha\, u_{1,1} + \beta\, \tilde u_{1,1}\) of the two eigenfunctions. Each combination has a single
diameter as a nodal curve, whose orientation depends upon the coefficients \(\alpha, \beta\). The two
halves of the drum vibrate up and down in opposing directions: when the top half is up,
the bottom half is down, and vice versa. The next set of modes have two perpendicular
diameters as nodal curves, and the four quadrants of the drum vibrate up and down in
tandem, with adjacent quadrants having opposing displacements. Next is a single mode,
with a circular nodal curve whose (relative) radius \(\zeta_{0,1}/\zeta_{0,2} \approx 0.4356\) is the ratio of the
first two roots of the order zero Bessel function; see Exercise for a justification. In this
case, the inner disk and outer annulus vibrate in opposing directions.
Example 16.7. Rectangular Drums. For a general rectangular drum, the nodal
curves are relatively uninteresting. Since the normal modes (16.89) are separable products
of trigonometric functions in the coordinate variables x, y, the nodal curves are regularly
equi-spaced straight lines parallel to the sides of the rectangle. The internodal regions
are small rectangles, all of the same size and shape, with adjacent rectangles vibrating in
opposite directions.

Figure 16.6. Nodal Curves and Relative Frequencies of Vibration of a Circular Membrane. (The nine panels correspond to the modes with relative frequencies 1.000, 1.593, 2.136, 2.295, 2.653, 2.917, 3.155, 3.500, and 3.598.)
A more interesting collection of nodal curves occurs when the rectangle admits multiple eigenvalues, so-called accidental degeneracies. If two of the eigenvalues (16.87)
are equal, \(\lambda_{m,n} = \lambda_{k,l}\), which occurs when
\[
\frac{m^2}{a^2} + \frac{n^2}{b^2} = \frac{k^2}{a^2} + \frac{l^2}{b^2},
\tag{16.103}
\]
where \((m,n) \ne (k,l)\) are two distinct pairs of positive integers, then both eigenmodes
vibrate with a common frequency
\[
\omega = \omega_{m,n} = \omega_{k,l}.
\]
As a consequence, any linear combination of the eigenmodes
\[
\cos(\omega t) \left( \alpha\, \sin \frac{m\pi x}{a}\, \sin \frac{n\pi y}{b}
 + \beta\, \sin \frac{k\pi x}{a}\, \sin \frac{l\pi y}{b} \right),
\qquad \alpha, \beta \in \mathbb{R},
\]
will also qualify as a normal mode of vibration. The corresponding nodal curves
\[
\alpha\, \sin \frac{m\pi x}{a}\, \sin \frac{n\pi y}{b}
 + \beta\, \sin \frac{k\pi x}{a}\, \sin \frac{l\pi y}{b} = 0
\tag{16.104}
\]
have a more intriguing geometry. Their configurations can change dramatically as the
relative magnitudes of \(\alpha, \beta\) vary.

Figure 16.7. Some Nodal Curves for a Square Membrane.

For example, if \(R = \{\, 0 < x < 1,\; 0 < y < 1 \,\}\) is the unit square, then an accidental
degeneracy, satisfying (16.103), occurs whenever
\[
m^2 + n^2 = k^2 + l^2.
\tag{16.105}
\]
Thus, two distinct ordered pairs of positive integers (m, n) and (k, l) must have the same
norm. The simplest possibility occurs whenever \(m \ne n\), in which case we merely reverse
the order, setting k = n, l = m. In Figure 16.7 we illustrate the nodal curves
\[
\sin 4\pi x\, \sin \pi y + \beta\, \sin \pi x\, \sin 4\pi y = 0, \qquad \beta = .2,\ .5,\ 1,
\]
corresponding to three different linear combinations of the eigenfunctions, with m = l = 4,
n = k = 1. The associated vibrational frequency is \(\omega_{4,1} = c\, \pi \sqrt{17}\).


Remark: Classifying the rectangles that admit such accidental degeneracies takes us
into the realm of number theory, [NumTh]. The basic issue is to classify numbers that can be
written as a sum of two squared integers in more than one way, as in (16.105). Or, stated
another way, to find all integer points that lie on a common circle.
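A brute-force search illustrates the point. The sketch below lists the integers up to 2N² that admit more than one representation as a sum of two positive squares, where N is an arbitrary cutoff; swapping m and n is not counted as a new representation, so the printed cases are the genuinely different degeneracies.

# Sketch: integers with more than one representation m^2 + n^2 (m <= n).
from collections import defaultdict

N = 20
reps = defaultdict(list)
for m in range(1, N + 1):
    for n in range(m, N + 1):                 # unordered pairs m <= n
        reps[m * m + n * n].append((m, n))

for value, pairs in sorted(reps.items()):
    if len(pairs) > 1:                        # more than one representation
        print(value, pairs)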


Chapter 17
Partial Differential Equations in Space
At last we have ascended the dimensional ladder to its ultimate rung (at least for those
of us living in a three-dimensional universe): partial differential equations in physical
space. Fortunately, almost everything of importance has already appeared in the oneand two-dimensional situations, and appending a third dimension is, for the most part,
simply a matter of appropriately adapting the same basic constructions. Thus, separation
of variables, Green's functions and fundamental solutions continue to be the weapons of
choice. Unfortunately, despite the best efforts of mathematicians, the most powerful of
our planar tools, conformal mapping, does not carry over to higher dimensions. The crux
of the problem is the relative lack of conformal maps.
As before, the three primary examples are the three-dimensional Laplace equation,
modeling equilibrium configurations of solid bodies, the three-dimensional heat equation,
which models basic spatial diffusion processes, and the three-dimensional wave equation,
governing small vibrations of solid bodies. Of course, the dimensional ladder continues to
stretch onwards to general four-, five-, and even n-dimensional counterparts of these basic
linear systems. However, almost all important analytical and numerical techniques already
appear by the time we reach three-dimensional space, and such extensions are of interest
primarily to pure mathematicians and, possibly, modern theoretical physicists.
The basic underlying solution techniques, separation of variables and Green's functions or fundamental solutions, have already appeared. In three-dimensional problems,
separation of variables can be used in rectangular, cylindrical and spherical coordinates.
The first two do not produce anything fundamentally new, and are therefore left to the exercises. The most important case is in spherical coordinates, and here we find new special
functions known as spherical harmonics and spherical Bessel functions. These functions
play important roles in a number of physical systems, including the quantum theory of
atomic structure that underlies the spectroscopic and reactive properties of atoms, and
hence the periodic table and, in a sense, all of modern chemistry.
The fundamental solution for the three-dimensional heat equation can be easily guessed
from its one- and two-dimensional versions. The three-dimensional wave equation, surprisingly, has an explicit solution formula of d'Alembert form, albeit quite a bit more complicated. Indeed, attempts to derive such a formula for the two-dimensional version were
unsuccessful, and only through a method of descent starting with the three-dimensional
solution are we able to arrive at the solution to the two-dimensional wave equation. This
also points out a critical difference between waves in two- and three-dimensional media.
Huygens' principle states that three-dimensional waves due to a localized initial disturbance remain localized as they propagate in space; this is not true in two dimensions,

and a concentrated planar disturbance leads to a residual disturbance that never entirely
disappears!

17.1. The Laplace and Poisson Equations.


We begin, as always, with systems in equilibrium. The most fundamental system is
the three-dimensional Laplace equation
\[
\Delta u = u_{xx} + u_{yy} + u_{zz} = 0,
\tag{17.1}
\]
in which \(\mathbf{x} = (x, y, z)\) represents Cartesian coordinates in \(\mathbb{R}^3\). The solutions to the
Laplace equation continue to be known as harmonic functions. The Laplace equation
models unforced equilibria; Poisson's equation is the inhomogeneous version
\[
-\,\Delta u = f(x,y,z),
\tag{17.2}
\]
where the inhomogeneity f represents some form of external forcing.


The basic boundary value problem for the Laplace or the Poisson equation seeks a
solution inside a bounded domain \(\Omega \subset \mathbb{R}^3\) subject to either Dirichlet boundary conditions,
prescribing the function values
\[
u = h \quad \text{on} \quad \partial\Omega,
\tag{17.3}
\]
or Neumann boundary conditions, prescribing its normal derivative
\[
\frac{\partial u}{\partial n} = k \quad \text{on} \quad \partial\Omega,
\tag{17.4}
\]
or mixed boundary conditions, in which one imposes Dirichlet conditions on part of the
boundary and Neumann conditions on the remainder. Keep in mind that the boundary of
the domain consists of one or more surfaces, which will be oriented using the unit normal \(\mathbf{n}\)
pointing outwards, away from the domain.
The boundary value problems for the three-dimensional Laplace and Poisson equations
govern a wide variety of equilibrium situations in physics. Among the areas of application,
we mention:
(a) Ideal fluid flow: Here u represents the velocity potential for an incompressible, irrotational steady state fluid flow in a container, with velocity vector field \(\mathbf{v} = \nabla u\).
Homogeneous Neumann boundary conditions correspond to a solid boundary which
fluid cannot penetrate.
(b) Heat conduction: Here u represents the temperature in a solid body. Dirichlet conditions correspond to fixing the temperature on the bounding surface(s), whereas
homogeneous Neumann conditions correspond to an insulated boundary, i.e., one
which does not allow any heat flux. The inhomogeneity f represents an internal
heat source.
(c) Elasticity: In certain restricted situations, u represents an equilibrium deformation of
a solid body, e.g., a radial deformation of a ball. Fully three-dimensional elasticity
is governed by a system of partial differential equations; see Example 20.8.

(d) Electrostatics: Here u represents the electromagnetic potential in a conducting medium.


(e) Gravitation: The Newtonian gravitational potential in flat empty space is also prescribed by the Laplace equation. General relativity is a vastly more complicated
system, leading to systems of nonlinear partial differential equations.
Self-Adjoint Formulation and Minimum Principles
The Laplace and Poisson equations naturally fit into our self-adjoint equilibrium
framework. The construction is a straightforward adaptation of the planar version of
Section 14.4. We introduce the \(L^2\) inner products
\[
\langle\, u\, ;\, \tilde u\, \rangle = \iiint_\Omega u(x,y,z)\, \tilde u(x,y,z)\; dx\, dy\, dz, \qquad
\langle\langle\, v\, ;\, \tilde v\, \rangle\rangle = \iiint_\Omega v(x,y,z) \cdot \tilde v(x,y,z)\; dx\, dy\, dz,
\tag{17.5}
\]
between scalar fields \(u, \tilde u\), and between vector fields \(v, \tilde v\), defined on a domain \(\Omega \subset \mathbb{R}^3\). We
assume that the functions in question are sufficiently nice that these inner products are
well-defined; if \(\Omega\) is unbounded, this requires that they decay to zero reasonably rapidly at
large distances. When subject to homogeneous boundary conditions of the proper form,
the adjoint of the gradient operator with respect to the \(L^2\) inner products is minus the
divergence:
\[
\nabla^{*} = -\,\nabla \cdot\,.
\tag{17.6}
\]
As we have learned, the computation of the adjoint relies on an integration by parts formula. In the plane, Green's Formula (A.55) provided the basic tool. For partial differential
equations in three-dimensional space, we rely on the Divergence Theorem B.36. The first
step is to establish the three-dimensional analog of Green's Formula (14.78). To this end,
we apply the divergence identity (B.82) to the product u v of a scalar field u and a vector
field v, leading to the identity
\[
\iiint_\Omega \bigl( \nabla u \cdot v + u\, \nabla \cdot v \bigr)\; dx\, dy\, dz
 = \iiint_\Omega \nabla \cdot (u\, v)\; dx\, dy\, dz
 = \iint_{\partial\Omega} u\, (v \cdot \mathbf{n})\; dS.
\tag{17.7}
\]
Rearranging the terms in this formula produces an integration by parts formula for volume
integrals:
\[
\iiint_\Omega (\nabla u \cdot v)\; dx\, dy\, dz
 = \iint_{\partial\Omega} u\, (v \cdot \mathbf{n})\; dS
 - \iiint_\Omega u\, (\nabla \cdot v)\; dx\, dy\, dz.
\tag{17.8}
\]
Note that the gradient operator on the scalar field u has moved to become a divergence
operator on the vector field v. The boundary integral will vanish provided either
(a) u = 0 on \(\partial\Omega\), which gives homogeneous Dirichlet boundary conditions, or
(b) \(v \cdot \mathbf{n} = 0\) on \(\partial\Omega\), which leads to homogeneous Neumann boundary conditions
\(\partial u/\partial n = 0\) on \(\partial\Omega\), since the gradient operator maps u to the vector field \(v = \nabla u\),
whose normal component \(v \cdot \mathbf{n} = \nabla u \cdot \mathbf{n} = \partial u/\partial n\) equals the normal derivative of u, or
(c) \(\partial\Omega = D \cup N\) decomposes into two non-overlapping parts, and we impose Dirichlet
conditions u = 0 on D and Neumann conditions \(v \cdot \mathbf{n} = 0\) on the remaining part N,
leading to the usual mixed boundary conditions.

Thus, subject to the homogeneous boundary conditions, the integration by parts formula (17.8) takes the form
\[
\langle\, \nabla u\, ;\, v\, \rangle = \langle\, u\, ;\, -\,\nabla \cdot v\, \rangle,
\tag{17.9}
\]
which proves (17.6). Consequently, the Laplace equation takes our well-known self-adjoint
form
\[
\nabla^{*} \circ \nabla\, u = -\,\nabla \cdot (\nabla u) = -\,\Delta u.
\tag{17.10}
\]
Using more general weighted inner products leads to a more general elliptic boundary value
problem; see Exercise .
As before, (17.10) implies that the Laplacian is positive semi-definite, and positive
definite provided ker \(\nabla\) = {0}. Since, on a connected domain, only constant functions
are annihilated by the gradient operator, the Dirichlet and mixed boundary conditions
lead to positive definite boundary value problems, while the Neumann boundary value
problem is only semi-definite. As a result, the solution to the boundary value problem can
be characterized by the three-dimensional version of the Dirichlet minimization principle
(14.91).
Theorem 17.1. The solution u(x, y, z) to the Poisson equation (17.2) subject to
Dirichlet boundary conditions (17.3) is characterized as the unique function that minimizes
the Dirichlet integral
\[
\tfrac{1}{2}\, \| \nabla u \|^2 - \langle\, u\, ;\, f\, \rangle
 = \iiint_\Omega \Bigl[\, \tfrac{1}{2} (u_x^2 + u_y^2 + u_z^2) - f\, u \,\Bigr]\; dx\, dy\, dz
\tag{17.11}
\]
among all \(C^1\) functions that satisfy the prescribed boundary conditions.

The same argument as in Section 14.4 shows that the same minimization principle
applies to solutions of the inhomogeneous Dirichlet boundary value problem. For mixed
boundary conditions, one must append an additional boundary integral, and the solution
minimizes the modified Dirichlet integral
\[
\iiint_\Omega \Bigl[\, \tfrac{1}{2} (u_x^2 + u_y^2 + u_z^2) - f\, u \,\Bigr]\; dx\, dy\, dz
 - \iint_{N} k\, u\; dS,
\tag{17.12}
\]
where N is the Neumann part of the boundary. Details are relegated to the exercises. The minimization principle forms the foundation of the three-dimensional finite
element method for constructing numerical solutions to the boundary value problem; see
[101, num3] for details.

17.2. Separation of Variables.


Even in higher dimensions, separation of variables remains the workhorse of explicit
solution methods for linear partial differential equations. As always, the technique is necessarily restricted to rather specific geometrical configurations. In three-dimensional space,
the simplest are problems formulated on rectangular, cylindrical or spherical domains. See
[87, 90, 92] for details on the more exotic types of separable coordinate systems, including
ellipsoidal, toroidal, parabolic spheroidal, and so on.

The simplest domain to which the separation of variables method applies is a rectangular box, \(R = \{\, 0 < x < a,\; 0 < y < b,\; 0 < z < c \,\}\). A complete separation of variables
ansatz u(x, y, z) = v(x) w(y) q(z) leads to a computation that is almost identical to the
two-dimensional version. The details of the resulting Fourier series solution are left to the
reader; see Exercise .
In the case when the domain is a cylinder, one passes to cylindrical coordinates to
effect the separation. The solution can be written in terms of trigonometric functions and
Bessel functions, with the details being outlined in Exercise . The most interesting case
is that of a solid sphere, and this case will be developed in some detail.
Laplace's Equation in a Ball
Suppose we are given a solid spherical ball (e.g., the earth), with a specified temperature distribution on its boundary. The problem is to determine the equilibrium temperature
within the ball. To simplify matters, we shall choose units in which the radius of the ball
is equal to 1. Therefore, we must solve the Dirichlet boundary value problem
\[
\frac{\partial^2 u}{\partial x^2} + \frac{\partial^2 u}{\partial y^2} + \frac{\partial^2 u}{\partial z^2} = 0,
\qquad x^2 + y^2 + z^2 < 1,
\qquad\qquad
u(x,y,z) = h(x,y,z), \qquad x^2 + y^2 + z^2 = 1.
\tag{17.13}
\]

Problems in spherical geometries usually simplify when re-expressed in terms of spherical
coordinates \(r, \varphi, \theta\), as defined by
\[
x = r\, \sin\varphi\, \cos\theta, \qquad y = r\, \sin\varphi\, \sin\theta, \qquad z = r\, \cos\varphi.
\]
In these coordinates, the Laplace equation takes the form
\[
\Delta u = \frac{\partial^2 u}{\partial r^2} + \frac{2}{r}\, \frac{\partial u}{\partial r}
 + \frac{1}{r^2}\, \frac{\partial^2 u}{\partial \varphi^2}
 + \frac{\cos\varphi}{r^2 \sin\varphi}\, \frac{\partial u}{\partial \varphi}
 + \frac{1}{r^2 \sin^2\varphi}\, \frac{\partial^2 u}{\partial \theta^2} = 0.
\tag{17.14}
\]

The derivation of this important formula is the final result of a fairly nasty chain rule
computation, and is left to the reader to verify. (Set aside lots of paper and keep an eraser
handy!)
To solve the spherical coordinate form of the Laplace equation, we begin by separating
off the radial part of the solution, using the separation of variables ansatz
\[
u(r,\varphi,\theta) = w(r)\, \Theta(\varphi,\theta).
\tag{17.15}
\]
Substituting into (17.14), dividing the resulting equation through by the product \(w\, \Theta\), and
placing all the terms involving r on one side of the equation yields
\[
\frac{r^2}{w}\, \frac{d^2 w}{dr^2} + \frac{2r}{w}\, \frac{dw}{dr}
 = -\,\frac{\Delta_S[\,\Theta\,]}{\Theta} = \mu,
\tag{17.16}
\]
where
\[
\Delta_S[\,\Theta\,] = \frac{\partial^2 \Theta}{\partial \varphi^2}
 + \cot\varphi\, \frac{\partial \Theta}{\partial \varphi}
 + \frac{1}{\sin^2\varphi}\, \frac{\partial^2 \Theta}{\partial \theta^2}.
\tag{17.17}
\]

The second order differential operator \(\Delta_S\), which contains only the angular components of
the Laplacian operator, is of particular significance. It is known as the spherical Laplacian,
and governs the equilibrium and dynamics of thin spherical shells, as discussed below.
Returning to the radially separated form (17.16) of the Laplace equation, our usual
separation argument works. The left hand side depends only on r, while the right hand side
depends only on \(\varphi, \theta\). This can only occur if both sides are equal to a common separation
constant, denoted by \(\mu\) in the equation. As a consequence, the radial component w(r)
satisfies the ordinary differential equation
\[
r^2\, w'' + 2\, r\, w' - \mu\, w = 0,
\tag{17.18}
\]
which is of Euler type (3.74). We will put this equation aside to solve later. The angular
components in (17.16) assume the form
\[
\Delta_S[\,\Theta\,] + \mu\, \Theta = 0,
\qquad \text{or, explicitly,} \qquad
\frac{\partial^2 \Theta}{\partial \varphi^2}
 + \frac{\cos\varphi}{\sin\varphi}\, \frac{\partial \Theta}{\partial \varphi}
 + \frac{1}{\sin^2\varphi}\, \frac{\partial^2 \Theta}{\partial \theta^2}
 + \mu\, \Theta = 0.
\tag{17.19}
\]
This second order partial differential equation constitutes the eigenvalue equation for the
spherical Laplacian, and is known as the spherical Helmholtz equation.
To solve the spherical Helmholtz equation, we adopt a further separation of angular
variables,
\[
\Theta(\varphi,\theta) = p(\varphi)\, q(\theta),
\tag{17.20}
\]
which we substitute into (17.19). Dividing the result by the product p q, and then rearranging terms, we are led to a second separated system
\[
\frac{\sin^2\varphi}{p}\, \frac{d^2 p}{d\varphi^2}
 + \frac{\cos\varphi\, \sin\varphi}{p}\, \frac{dp}{d\varphi}
 + \mu\, \sin^2\varphi
 = -\,\frac{1}{q}\, \frac{d^2 q}{d\theta^2} = \nu.
\]
The left hand side depends only on \(\varphi\) while the right hand side depends only on \(\theta\), so
the two sides must equal a common separation constant, denoted by \(\nu\). The spherical
Helmholtz equation then splits into a pair of ordinary differential equations
\[
\sin^2\varphi\, \frac{d^2 p}{d\varphi^2}
 + \cos\varphi\, \sin\varphi\, \frac{dp}{d\varphi}
 + (\mu\, \sin^2\varphi - \nu)\, p = 0,
\qquad
\frac{d^2 q}{d\theta^2} + \nu\, q = 0.
\tag{17.21}
\]

(17.21)

The equation for q() is easy to solve. Since the meridial angle varies from 0 to 2 , the
function q() must be a 2 periodic function. Thus, we are reduced to solving the usual
periodic boundary value problem for q(); see, for instance, (14.30). The eigenvalue or
separation constant takes on the values = m2 , where m = 0, 1, 2, . . . is an integer, and
q() = cos m

or

sin m ,

m = 0, 1, 2, . . . ,

(17.22)

are the required eigenfunctions. Each positive = m2 > 0 admits two linearly independent
eigenfunctions, while the = 0 only admits the constant eigenfunction q() 1.
With this information, we next solve the equation for the azimuthal component \(p(\varphi)\).
This is not an elementary differential equation, and finding the solutions requires some
work. The reasoning behind the following steps may not be immediately apparent to the

reader, since it is the result of a long, detailed study of this important differential equation
by mathematicians.
First, let us eliminate the trigonometric functions. To this end, we use the change of
variables
\[
t = \cos\varphi, \qquad p(\varphi) = P(\cos\varphi) = P(t).
\tag{17.23}
\]
According to the chain rule,
\[
\frac{dp}{d\varphi} = -\,\sin\varphi\, \frac{dP}{dt} = -\,\sqrt{1 - t^2}\; \frac{dP}{dt},
\qquad
\frac{d^2 p}{d\varphi^2}
 = \sin^2\varphi\, \frac{d^2 P}{dt^2} - \cos\varphi\, \frac{dP}{dt}
 = (1 - t^2)\, \frac{d^2 P}{dt^2} - t\, \frac{dP}{dt}.
\]
Substituting these expressions into the first equation in (17.21), and using the fact that
\(\nu = m^2\), we conclude that P(t) must satisfy the differential equation
\[
(1 - t^2)^2\, \frac{d^2 P}{dt^2} - 2\, t\, (1 - t^2)\, \frac{dP}{dt}
 + \bigl[\, \mu\, (1 - t^2) - m^2 \,\bigr]\, P = 0.
\tag{17.24}
\]
Unfortunately, this differential equation is still not easy to solve, but at least its coefficients
are polynomials. Equation (17.24) is known as the Legendre differential equation of order
m, and its solutions are known as Legendre functions, since they were first used by Legendre
to analyze the gravitational attraction of ellipsoidal bodies.
While the general solution to the Legendre equation requires a new type of special
function, the solutions we are actually interested in can all be written in terms of elementary
algebraic functions. First of all, since \(t = \cos\varphi\), the solution only needs to be defined on
the interval \(-1 \le t \le 1\). The endpoints of this interval, \(t = \pm 1\), correspond to the north
pole, \(\varphi = 0\), and the south pole, \(\varphi = \pi\), of the sphere. Both endpoints are singular points
for the Legendre equation, since the coefficient \((1 - t^2)^2\) of the leading order derivative
vanishes when \(t = \pm 1\). Since ultimately we need the separable solution (17.15) to be a
well-defined function of x, y, z (even at points where the spherical coordinates degenerate,
i.e., on the z axis), we need \(p(\varphi)\) to be well-defined at \(\varphi = 0, \pi\), and this requires P(t)
to be bounded at the singular points \(t = \pm 1\). As we learned in our study of the Bessel
equation, merely requiring the solution of an ordinary differential equation to be bounded
at a singular point can serve as a legitimate boundary condition and serve to distinguish
the relevant solutions. Requiring the solution to be bounded at both endpoints is even
more restrictive:
\[
|\, P(-1)\, | < \infty, \qquad |\, P(+1)\, | < \infty.
\tag{17.25}
\]
It turns out that this occurs only for very special values of the separation constant \(\mu\).
We will justify the following statements in Appendix C. Consider first the case m = 0.
In this case, it turns out that the eigenfunctions, i.e., solutions to the Legendre boundary
value problem (17.24), (17.25), are the Legendre polynomials
\[
P_n(t) = \frac{1}{2^n\, n!}\, \frac{d^n}{dt^n} (t^2 - 1)^n
\tag{17.26}
\]

that we already encountered in Chapter 5. Equation (5.44) contains explicit formulas for
the first few Legendre polynomials. Indeed, we now finally comprehend the reason for
the orthogonality of the Legendre polynomials. They are the common eigenfunctions of a
self-adjoint boundary value problem! Their orthogonality is a consequence of the general
theorem on eigenvectors or eigenfunctions of self-adjoint linear operators, and is discussed
in detail in Exercise .
For general m > 0, the eigenfunctions of the Legendre boundary value problem
(17.24), (17.25) are not always polynomials. They are known as the associated Legendre functions, and can be constructed using the explicit formula
\[
P_n^m(t) = (1 - t^2)^{m/2}\, \frac{d^m}{dt^m} P_n(t)
 = \frac{(1 - t^2)^{m/2}}{2^n\, n!}\, \frac{d^{n+m}}{dt^{n+m}} (t^2 - 1)^n,
\qquad n = m,\ m+1,\ \dots.
\tag{17.27}
\]
Here is a list of the first few associated Legendre functions:
\[
P_0^0(t) = 1, \qquad P_1^0(t) = t, \qquad P_1^1(t) = \sqrt{1 - t^2},
\]
\[
P_2^0(t) = \tfrac{3}{2}\, t^2 - \tfrac{1}{2}, \qquad
P_2^1(t) = 3\, t\, \sqrt{1 - t^2}, \qquad
P_2^2(t) = 3\, (1 - t^2),
\]
\[
P_3^0(t) = \tfrac{5}{2}\, t^3 - \tfrac{3}{2}\, t, \qquad
P_3^1(t) = \tfrac{3}{2}\, \sqrt{1 - t^2}\, (5\, t^2 - 1), \qquad
P_3^2(t) = 15\, t\, (1 - t^2), \qquad
P_3^3(t) = 15\, (1 - t^2)^{3/2},
\]
\[
P_4^0(t) = \tfrac{35}{8}\, t^4 - \tfrac{15}{4}\, t^2 + \tfrac{3}{8}, \qquad
P_4^1(t) = \tfrac{5}{2}\, \sqrt{1 - t^2}\, (7\, t^3 - 3\, t), \qquad
P_4^2(t) = \tfrac{15}{2}\, (1 - t^2)(7\, t^2 - 1),
\]
\[
P_4^3(t) = 105\, t\, (1 - t^2)^{3/2}, \qquad
P_4^4(t) = 105\, (t^4 - 2\, t^2 + 1).
\tag{17.28}
\]
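This list can be regenerated symbolically from formula (17.27). The following sketch uses SymPy for the differentiation, which is an assumption of convenience, since the text itself does not rely on any software.

# Sketch: generate the associated Legendre functions (17.28) from formula (17.27).
import sympy as sp

t = sp.symbols('t')

def P(n, m):
    base = sp.diff((t**2 - 1)**n, t, n + m) / (2**n * sp.factorial(n))
    return sp.simplify((1 - t**2)**sp.Rational(m, 2) * base)

for n in range(5):
    for m in range(n + 1):
        print(f"P_{n}^{m}(t) =", P(n, m))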

When \(m = 2k \le n\) is even, \(P_n^m(t)\) is a polynomial function, while when \(m = 2k+1 \le n\)
is odd, it has an extra factor of \(\sqrt{1 - t^2}\) multiplying a polynomial. Keep in mind that
the square root is real and positive, since we are restricting our attention to the interval
\(-1 \le t \le 1\). If m > n, then the formula (17.27) yields zero.
Graphs of the Legendre polynomials \(P_n(t) = P_n^0(t)\) can be found in Figure 5.2. In addition, Figure 17.1 displays the graphs of the associated Legendre functions \(P_2^1(t), \dots, P_4^4(t)\).
(The graph of \(P_1^1(t)\) is omitted, since it is merely a semi-circle.) Pay particular attention
to the fact that the graphs have quite different vertical scales.
Theorem 17.2. Let \(m \ge 0\) be a non-negative integer. Then the eigenfunctions for
the mth order Legendre boundary value problem prescribed by (17.24), (17.25) are the
associated Legendre functions \(P_n^m(t)\) for \(n = m, m+1, m+2, \dots\). The corresponding eigenvalues
are \(\mu_n = n(n+1)\).
Returning to the original azimuthal variable \(\varphi\), we discover that the boundary value
problem
\[
\sin^2\varphi\, \frac{d^2 p}{d\varphi^2}
 + \cos\varphi\, \sin\varphi\, \frac{dp}{d\varphi}
 + \mu\, \sin^2\varphi\; p - m^2\, p = 0,
\qquad
|\, p(0)\, |,\ |\, p(\pi)\, | < \infty,
\tag{17.29}
\]

Figure 17.1. Associated Legendre Functions. (The panels show the graphs of \(P_2^1(t),\ P_2^2(t),\ P_3^1(t),\ P_3^2(t),\ P_3^3(t),\ P_4^1(t),\ P_4^2(t),\ P_4^3(t),\ P_4^4(t)\) on \(-1 \le t \le 1\).)

has eigenvalues and eigenfunctions
\[
\mu_n = n(n+1), \qquad p_n^m(\varphi) = P_n^m(\cos\varphi), \qquad \text{for} \qquad n = m,\ m+1,\ \dots,
\tag{17.30}
\]
given in terms of the associated Legendre functions. The nth eigenvalue \(\mu_n\) admits a total of
n + 1 linearly independent eigenfunctions, namely \(p_n^0(\varphi), \dots, p_n^n(\varphi)\). The functions \(p_n^m(\varphi)\)
are, in fact, trigonometric polynomials of degree n. Here are the first few, written in
Fourier form:
\[
p_0^0(\varphi) = 1, \qquad p_1^0(\varphi) = \cos\varphi, \qquad p_1^1(\varphi) = \sin\varphi,
\]
\[
p_2^0(\varphi) = \tfrac{1}{4} + \tfrac{3}{4} \cos 2\varphi, \qquad
p_2^1(\varphi) = \tfrac{3}{2} \sin 2\varphi, \qquad
p_2^2(\varphi) = \tfrac{3}{2} - \tfrac{3}{2} \cos 2\varphi,
\]
\[
p_3^0(\varphi) = \tfrac{3}{8} \cos\varphi + \tfrac{5}{8} \cos 3\varphi, \qquad
p_3^1(\varphi) = \tfrac{3}{8} \sin\varphi + \tfrac{15}{8} \sin 3\varphi, \qquad
p_3^2(\varphi) = \tfrac{15}{4} \cos\varphi - \tfrac{15}{4} \cos 3\varphi, \qquad
p_3^3(\varphi) = \tfrac{45}{4} \sin\varphi - \tfrac{15}{4} \sin 3\varphi,
\]
\[
p_4^0(\varphi) = \tfrac{9}{64} + \tfrac{5}{16} \cos 2\varphi + \tfrac{35}{64} \cos 4\varphi, \qquad
p_4^1(\varphi) = \tfrac{5}{8} \sin 2\varphi + \tfrac{35}{16} \sin 4\varphi, \qquad
p_4^2(\varphi) = \tfrac{45}{16} + \tfrac{15}{4} \cos 2\varphi - \tfrac{105}{16} \cos 4\varphi,
\]
\[
p_4^3(\varphi) = \tfrac{105}{4} \sin 2\varphi - \tfrac{105}{8} \sin 4\varphi, \qquad
p_4^4(\varphi) = \tfrac{315}{8} - \tfrac{105}{2} \cos 2\varphi + \tfrac{105}{8} \cos 4\varphi.
\tag{17.31}
\]
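A quick numerical spot-check confirms that these Fourier forms agree with the algebraic expressions in (17.28); the two sample entries and the test points below are arbitrary choices.

# Sketch: spot-check two Fourier forms in (17.31) against P_n^m(cos phi) from (17.28).
import numpy as np

phi = np.linspace(0.01, np.pi - 0.01, 7)
t = np.cos(phi)

P32 = 15 * (t - t**3)                     # P_3^2(t) from (17.28)
P44 = 105 * (t**4 - 2 * t**2 + 1)         # P_4^4(t) from (17.28)

p32 = 15/4 * np.cos(phi) - 15/4 * np.cos(3 * phi)
p44 = 315/8 - 105/2 * np.cos(2 * phi) + 105/8 * np.cos(4 * phi)

print(np.allclose(P32, p32), np.allclose(P44, p44))   # expect True True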

It is also instructive to plot the eigenfunctions in terms of the angle \(\varphi\) and compare with
those in Figure 17.1; see Figure Lphi .
3/7/03

752

c 2003

Peter J. Olver

At this stage, we have determined both angular components of our separable solutions
(17.20). Multiplyiing the two parts together results in the spherical angle functions
Ynm (, ) = cos m Pnm (cos ),
Ye m (, ) = sin m P m (cos ),
n

n = 0, 1, 2, . . . ,

(17.32)

m = 0, 1, . . . , n,

which are known as spherical harmonics. The spherical harmonics Y nm , Yenm satisfy the
spherical Helmholtz equation
S Ynm + n(n + 1) Ynm = 0 = S Yenm + n(n + 1) Yenm .

(17.33)

In other words, the spherical harmonics are the eigenfunctions for the spherical Laplacian
operator, (17.17), with associated eigenvalues n = n(n + 1) for n = 0, 1, 2, . . . . The
nth eigenvalue n admits a (2 n + 1)dimensional eigenspace, spanned by the spherical
harmonics
Yn0 (, ), Yn1 (, ), . . . , Ynn (, ), Yen1 (, ), . . . , Yenn (, ).

The omitted function Yen0 (, ) 0 is trivial, and so does not contribute.


Self-adjointness of the spherical Laplacian operator implies that the spherical harmonics are orthogonal with respect to the inner product
Z Z 2
ZZ
f g dS =
f (, ) g(, ) sin d d,
(17.34)
hf ;gi =
S1

where the surface area integral is over the sphere S1 = { k x k = 1 } of radius 1, cf. (B.40).
More correctly, self-adjointness only guarantees orthogonality for the harmonics corresponding to different eigenvalues. However, by our construction, the orthogonality formula (17.34) does, in fact, hold in general. The spherical harmonic norms can be explicitly
computed:
k Yn0 k2 =

4
,
2n + 1

k Ynm k2 = k Yenm k2 =

2 (n + m)!
.
(2 n + 1)(n m)!

(17.35)

Just as with the Fourier trigonometric functions, the case m = 0, where the spherical
harmonic Yn0 () does not depend upon , is special. A proof of this formula appears in
Exercise .
With some further work, it can be proved that the harmonic polynomials form a complete orthogonal system of functions on the unit sphere. This means that any reasonable,
e.g., piecewise C 1 , function h: S1 R can be expanded into a convergent spherical Fourier
series

n
h
i
X
X
c0,n 0
c0,0
+
Yn () +
h(, ) =
cm,n Ynm (, ) + e
cm,n Yenm (, )
(17.36)
2
2
n=1
m=1
in the spherical harmonics. Applying the orthogonality relations (17.34), the spherical
Fourier coefficients are given by the inner products
c0,n
3/7/03

2 h f ; Yn0 i
,
=
k Yn0 k2

cm,n

h f ; Ynm i
=
,
k Ynm k2
753

e
cm,n

h f ; Yenm i
=
,
k Ye m k2
n

n 0,
1 m n,

c 2003

Peter J. Olver

or, explicitly, using the formulae (17.35) for the norms,


cm,n
e
cm,n

(2 n + 1)(n m)!
=
2 (n + m)!
(2 n + 1)(n m)!
=
2 (n + m)!

Z
Z

2 Z
0

2 Z
0

h(, ) cos n Pn (cos ) sin d d,


(17.37)
h(, ) sin n Pn (cos ) sin d d.

The factor sin comes from the spherical surface area formula (B.40). As with an ordinary
Fourier series, the extra 21 was introduced in the c0,n terms in the series (17.37) so that
the formulae (17.37) are valid for all m, n. In particular, the constant term the spherical
harmonic series
c0,0
1
=
2
4

ZZ

S1

1
h dS =
4

2 Z

h(, ) sin d d

(17.38)

is the mean of the function f over the unit sphere.


To complete our solution to the Laplace equation on the solid ball, we still need to
analyze the ordinary differential equation (17.18) for the radial component w(r). Using
the fact that the original separation constant is = n(n + 1) for some non-negative integer
n 0, the radial equation (17.18) takes the form
r2 w00 + 2 r w 0 n(n + 1) w = 0.

(17.39)

As noted earlier, this is a second order linear equation of Euler type (3.74), and can be
solved by using the power ansatz w(r) = r . Substituting into the equation, we find the
exponent must satisfy the quadratic equation
2 + n(n + 1) = 0,

and hence

=n

or

= (n + 1).

Therefore, the two linearly independent solutions are


w1 (r) = r n

and

w2 (r) = r n1 .

(17.40)

Since we are only interested in solutions that remain bounded at r = 0 the center of
the ball we should just retain the first solution w(r) = r n in our analysis.
At this stage, we have solved all three ordinary differential equations for the separable solutions. We combine the results (17.22), (17.32), (17.40) together to produce the
spherically separable solutions (17.15) to the Laplace equation
Hnm (r, , ) = r n Ynm (, ) = r n cos m Pnm (cos ),
e m (r, , ) = r n Ye m (, ) = r n sin m P m (cos ),
H
n
n
n

n = 0, 1, 2, . . . ,
m = 0, 1, . . . , n,

(17.41)

known as harmonic polynomials. As the name suggests, they are, in fact, polynomial
3/7/03

754

c 2003

Peter J. Olver

functions of the rectangular coordinates x, y, z. The first few harmonic polynomials are
H00 = 1,

H20 = z 2 12 x2 12 y 2

H10 = z,
H11 = x,
e 1 = y,
H
1

H21 = 3 x z,
e 1 = 3 y z,
H
2

H22 = 3 x2 3 y 2 ,
e 2 = 6 xy,
H
2

H30 = z 3 32 x2 z 32 y 2 z

H31 = 6 x z 2 32 x3 23 x y 2
e 1 = 6 y z 2 3 x2 y 3 y 3
H
3
2
2
H32 = 15 x2 z 15 y 2 z
e 2 = 30 x y z
H
3
H33 = 15 x3 45 x y 2
e 3 = 45 x2 y 15 y 3 .
H

(17.42)

e m are homogeneous polynomials of degree n. Indeed, the harmonic


Note that Hnm and H
n
polynomials
e 1, . . . H
en
Hn0 , Hn1 , . . . , Hnn , H
n
n
form a basis for the subspace of all homogeneous polynomials of degree n that solve the
three-dimensional Laplace equation, which therefore has dimension 2 n + 1. (The unlisted
e 0 0 is trivial, and so is not part of the basis.) Plotting these functions in a visually
H
n
instructive manner is challenging. Since they depend upon three variables, we are in sore
need of a four-dimensional viewing system to properly display and appreciate their graphs.

As we shall see, the harmonic polynomials form a complete system, and therefore the
general solution to the Laplace equation on the sphere can be written as a series therein:
!
n
h
i
X
c0,n 0
m
m
e (r, , )
cm,n Hn (r, , ) + e
cm,n H
Hn (r, ) +
n
2
m=1
!

h
i
X
X
c0,0
c0,n 0
+
Yn () +
.
=
cm,n rn Ynm (, ) + e
cm,n rn Yenm (, )
2
2
n=1
m=1
(17.43)
To complete our solution to the boundary value problem, we substitute the harmonic
polynomial series solution into the Dirichlet boundary conditions on the unit sphere r = 1,
yielding

X
c0,0
u(r, , ) =
+
2
n=1

c0,0 X
u(1, , ) =
+
2
n=1

n
h
i
X
c0,n 0
Yn () +
cm,n Ynm (, ) + e
cm,n Yenm (, )
2
m=1

= h(, ).

(17.44)
In view of the preceding remarks, the coefficients cm,n , e
cm,n in this harmonic polynomial
series are given by the orthogonality formulae (17.37). If they are bounded which occurs
for all L2 functions h and also certain generalized functions, including the delta function
then it is not hard to prove that the series converges everywhere, and, in fact, uniformly
on any smaller ball k x k = r r0 < 1.
Interestingly, if we revert to rectangular coordinates, then the spherical Fourier series

3/7/03

755

c 2003

Peter J. Olver

(17.43) takes the form


!
n
i
h
X
c0,n 0
e m (x, y, z)
.
Hn (x, y, z) +
cm,n Hnm (x, y, z) + e
cm,n H
n
2
m=1
(17.45)
The summand at order n is, in fact, a homogeneous polynomial of degree n. Therefore, the
Fourier series expands the function into a power series which is, in fact, the Taylor series
expansion for the harmonic function u at the origin! Any convergent Taylor expansion
converges to an analytic function. Therefore, just like their two-dimensional siblings,
harmonic functions are, in fact, analytic. According to the preceding paragraph, the
radius of convergence of the spherical harmonic Fourier/Taylor series is at least one, and
so u(x, y, z) is analytic inside the entire ball no matter how wild its boundary values
are.
The constant term in such a Taylor series can be identified with the value of the
function at the origin. On the other hand, the orthogonality formula (17.38) tells us that
ZZ
c0,0
1
=
u dS.
u(0, 0, 0) =
(17.46)
2
4
S1

X
c0,0
+
u(x, y, z) =
2
n=1

Therefore, we have established the three-dimensional version of the planar Theorem 14.7:
the value of the harmonic function at the center of the sphere is equal to the average of
its values u = h on the spheres surface.
Theorem 17.3. If u(x) is a harmonic function for all x R 3 , then u is analytic
in . Moreover, its value at a point x0 ,
ZZ
1
u(x0 ) =
u dS,
(17.47)
4 a2
Sa (x0 )
is equal to the average of its values on any sphere Sa (x0 ) = { k x x0 k = a } centered at the
point provided u is harmonic on the entire enclosed ball Ba (x0 ) = { k x x0 k a } .
Proof : It is easily checked that, under the hypothesis of the theorem,
U (x) = u(a x + x0 )
is harmonic on the unit ball k x k 1, and hence solves the boundary value problem (17.13)
with boundary values h(x) = U (x) = u(a x + x0 ) on k x k = 1 coming from the values of
u on the sphere Sa (x0 ). By the preceding remarks, U (x) is analytic for k x k < 1, and so
u(x) = U ((x x0 )/a) is analytic inside Ba (x0 ), and, in particular at x0 . Since x0 was
arbitrary, this proves analyticity of u everywhere in E. Moreover, according to (17.46),
ZZ
ZZ
1
1
u dS,
U dS =
u(x0 ) = U (0) =
4
4 a2
Sa (x0 )
S1
which proves the result.

Q.E.D.

Arguing as in Corollary 14.8, we establish a corresponding maximum principle for


harmonic functions of 3 variables.
3/7/03

756

c 2003

Peter J. Olver

Corollary 17.4. A harmonic function cannot have a local maximum or minimum


at any interior point of its domain of definition.
For instance, this result implies that a body in thermal equilibrium can achieve its
maximum and minimum temperature only on the boundary of the domain. In physical
language, heat energy must flow away from any internal maximum and towards any internal
minimum. Thus, a local maximum or minimum of temperature would preclude the body
being in thermal equilibrium.
Example 17.5. In this example, we shall determine the electrostatic potential inside
a hollow sphere when the upper and lower hemispheres are held at different constant potentials. This device is called a spherical capacitor and is realized experimentally by separating
the two charged hemispheres by a thin insulating ring at the equator. A straightforward
scaling argument allows us to choose our units so that the sphere has radius 1, while
the potential is set equal to 1 on the upper hemisphere and 0 or grounded on the lower
hemisphere. Therefore, we need to solve the Laplace equation u = 0 inside a solid ball
k x k < 1, with Dirichlet boundary conditions

1,
z > 0,
u(x, y, z) =
on
k x k = 1.
(17.48)
0,
z < 0,
The solution will be prescribed by a harmonic polynomials series (17.45) whose coefficients are determined by the boundary values (17.48). Before making on the required
computation, let us first note that since the boundary data does not depend upon the
meridial angle , the solution u = u(r, ) will also be independent of . Therefore, we
need only consider the -independent spherical harmonics, which are those with m = 0,
and hence

1 X
1 X
0
c H (x, y, z) =
c rn Pn (cos ),
u(r, ) =
2 n=0 n n
2 n=0 n

where we abbreviate c0,n = cn . The boundary conditions require


(

1,
0 < 21 ,
1 X
u(1, ) =
cn Pn (cos ) = f () =
1
2 n=0
0,
2 < .

The coefficients are given by (17.37), which, in the case m = 0, reduces to


Z 1
ZZ
Z /2
2n + 1
0
Pn (t) dt.
Pn (cos ) sin d = (2 n + 1)
cn =
f Yn dS = (2 n + 1)
2
0
0
S
(17.49)
The first few are
c0 = 1,

c1 = 32 ,

c2 = 0,

c3 = 87 ,

c4 = 0,

...

Therefore, the solution has the explicit Taylor expansion


u=
=
3/7/03

1
2
1
2

+
+

3
4
3
4

21 3
35 3
64 r cos 64 r
2
2
7 3
21
8 z 16 (x + y ) z +

r cos
z

757

cos 3 +
.

c 2003

Peter J. Olver

Note in particular that the value u(0, 0, 0) = 21 at the center of the sphere is the average
of its boundary values, in accordance with Corollary 17.4.
Remark : The same function solves the problem of thermal equilibrium in a solid
sphere with the upper hemisphere held at temperature 1 and the lower hemisphere at 0 .
Example 17.6. A closely related problem is to determine the electrostatic potential
outside a spherical capacitor. As in the preceding example, we take our capacitor of radius
1, with electrostatic charge + 1 on the upper hemisphere and 0 on the lower hemisphere.
Here, we need to solve the Laplace equation
u = 0,

k x k > 1,

in the unbounded domain outside a solid unit ball, subject to Dirichlet boundary conditions

0,
z > 0,
u=
on the unit sphere
k x k = 1.
1,
z < 0,
We expect the potential to be small at large distances r = k x k 1 away from the
capacitor. Therefore, the non-constant harmonic polynomial solutions will not help us
solve this problem, since they tend to as k x k .
However, by returning to our original separation of variables argument, we can construct a different class of solutions with the desired decay properties. When we solved
the radial equation (17.39), we discarded the solution w2 (r) = r n1 because it had a
singularity at the origin. In the present situation, the behavior of the function at r = 0 is
irrelevant; our current requirement is that the solution decays as r , and this is now
valid. Therefore, we can use the functions
Knm (x, y, z) = r 2 n1 Hnm (x, y, z) = r n1 Ynm (, ) = r n1 cos m Pnm (cos ),
(17.50)
e m (x, y, z) = r n1 Ye m (, ) = r n1 sin m P m (cos ),
e m (x, y, z) = r 2 n1 H
K
n

for solving such exterior problems. In the present case, we only need the functions that
are independent of , which means m = 0. We write the resulting solution as a series

1 X
1 X
0
u(r, ) =
cn Kn (x, y, z) =
cn r n1 Pn (cos ).
2 n=0
2 n=1

The boundary conditions

1 X
u(1, ) =
c P (cos ) = f () =
2 n=1 n n

1,
0,

0 < 21 ,
1
2 < ,

are identical with the previous example. Therefore, the coefficients are given by (17.49),
leading to the series expansion
u=

1
3 cos 21 cos + 35 cos 3
1
3z
14 z 3 21 (x2 + y 2 ) z
+

=
+

+ ,
2r
4 r2
64 r 3
2 r 4 r3
16 r 5
(17.51)

3/7/03

758

c 2003

Peter J. Olver

p
where r = x2 + y 2 + z 2 . Interestingly, at large distances, the higher order terms become
negligible, and the potential looks like that associated with a point charge of magnitude 12
the average of the potential over the sphere that is concentrated at the origin. This
is indicative of a general fact; see Exercise .

17.3. The Greens Function.


We now turn to the inhomogeneous form of Laplaces equation the Poisson equation
u = f

for all

x .

(17.52)

In applications, f (x) = f (x, y, z) represents some form of external forcing inside the solid
domain. To uniquely specify the solution, we need to impose appropriate boundary conditions Dirichlet, Neumann, or mixed. We shall mostly concentrate on the homogeneous
boundary variational problem.
As we learned in Chapters 10 and 14, the solution to the Poisson equation for a
general inhomogeneity f (x) can be found by a superposition formula based on the Greens
function, which is defined to be the particular solution corresponding to a delta function
inhomogeneity that is concentrated at a single point in the domain. Thus, for each =
(, , ) , the Greens function G(x; ) = G(x, y, z; , , ) is the unique solution to the
Poisson equation
u = (x ) = (x ) (y ) (z )

for all

x ,

(17.53)

subject to the chosen homogeneous boundary conditions. The solution to the general
Poisson equation (17.52) is then obtained by superposition: We write the forcing function
ZZZ
f (x, y, z) =
f (, , ) (x ) (y ) (z ) d d d

as a linear superposition of delta functions. By linearity, the solution


ZZZ
u(x, y, z) =
f (, , ) G(x, y, z; , , ) d d d

(17.54)

is then given as the same superposition of the Greens function solutions.


The Greens Function on the Entire Space

Except in a few specific instances, the explicit formula for the Greens function is
difficult to find. Nevertheless, certain general, useful features can be established. The
starting point is to investigate the Poisson equation (17.53) when the domain = R 3 is
all of three-dimensional space. Since the Laplacian is invariant under translations we can,
without loss of generality, place our delta impulse at the origin, and solve the particular
case
u = (x) ,
x R3.
Since (x) = 0 for all x 6= 0, the desired solution will, in fact, be a solution to the
homogeneous Laplace equation
u = 0,
3/7/03

759

x 6= 0,
c 2003

Peter J. Olver

save, possibly, for a single singularity concentrated at the origin. We impose boundary
constraints by seeking a solution that goes to zero, u 0, at large distances k x k .
The Laplace equation models the equilibria of a homogeneous, isotropic medium,
and so is also invariant under rotations. This indicates that, in any radially symmetric
configuration, the solution u = u(r) should only depend upon the distance from the origin,
r = k x k, and not the angular direction. Referring to the spherical coordinate form (17.14)
of the Laplacian operator, if u only depends upon r, its derivatives with respect to the
angular coordinates , are zero, and so u(r) solves the ordinary differential equation
d2 u 2 du
+
= 0.
dr2
r dr

(17.55)

This equation is, in effect, a first order linear ordinary differential equation for v = du/dr
and hence is easy to solve. The solutions are of the form
du
= v(r) = b log r,
dr

and hence

u(r) = a +

b
,
r

where a, b are arbitrary constants. The constant solution u(r) = a does not die away at
large distances, nor does it have a singularity at the origin. Therefore, if our intuition is
valid, the desired solution should be of the form
u=

b
b
b
.
=
=p
r
kxk
x2 + y 2 + z 2

(17.56)

Indeed, this function is harmonic solves Laplaces equation everywhere away from
the origin, and has a singularity at x = 0.
Remark : This solution is, up to constant multiple, the three-dimensional Newtonian
graviational potential due to a point mass at the origin. Its gradient

b
bx
f (x) =
=
.
kxk
k x k3
defines the gravitational force vector at the point x. When b > 0, the force vector f (x)
points in the direction of the mass concentrated at the origin. Its magnitude
kf k =

b
b
= 2
2
kxk
r

is proportional to one over the squared distance, and so satisfies the well-known inverse
square law of three-dimensional Newtonian gravity.
The inverse square law also models the electrostatic forces between charged bodies.
Thus, (17.56) can be interpreted as the electrostatic potential on a charged mass at position
x due to a electric charge that is concentrated at the origin. The constant b is positive
when the charges are of opposite signs, leading to an attractive force, and negative in the
repulsive case of like charges.
3/7/03

760

c 2003

Peter J. Olver

Returning to our problem, our remaining task is to fix the multiple b such that the
Laplacian of our candidate solution (17.56) has a delta function singularty at the origin;
equivalently, we must find c such that
r1 = c (x).

(17.57)

We already know that this equation holds away from the origin, since (x) = 0 when
x 6= 0. To investigate near the singularity, we integrate both sides of (17.57) over a small
solid ball B = { r = k x k = } of radius :
ZZZ
ZZZ
1

r dx dy dz =
c (x) dx dy dz = c,
(17.58)
B

where we used the definition of the delta function to evaluate the right hand side. On the
other hand, since r 1 = r 1 , we can use the divergence theorem (B.82) to evaluate
the left hand integral, whence

ZZ
ZZZ
ZZZ
1

1
1
dS,
r dx dy dz =
r dx dy dz =
r
S n
B
B
where the surface integral is over the bounding sphere S = B = { k x k = }. The
normal n to the sphere points in the radial direction, and hence the normal derivative
coincides with differentiation with respect to r. Therefore,

1
1
1
=
= 2.
n r
r r
r
The surface integral can now be explicitly evaluated:

ZZ
ZZ
ZZ
1
1

1
dS =
dS
=

dS = 4 ,
2
2
r
S r
S
S n
since S has surface area 4 2 . Substituting this result back into (17.58), we conclude
that
c = 4 ,
and hence
r 1 = 4 (x).
(17.59)
This is our desired formula! Therefore, the Greens function for a delta function impulse
at the origin is
1
1
1
p
=
=
.
(17.60)
G(x, y, z) =
4 r
4 k x k
4 x2 + y 2 + z 2
If the singularity is concentrated at the point = (, , ) instead of the origin, then
we merely translate the preceding solution. This leads immediately to the Greens function
G(x; ) = G(x ) =

1
1
p
=
2
4 k x k
4 (x ) + (y )2 + (z )2

(17.61)

on all of space. As a consequence of the superposition formula (17.54), we have proved


the following integral formula for the solutions to the Poisson equation on all of threedimensional space.
3/7/03

761

c 2003

Peter J. Olver

Theorem 17.7. A particular solution to the Poisson equation


u = f
is given by
1
u? (x, y, z) =
4

ZZZ

R3

x R3

for

(17.62)

f (, , ) d d d
p
.
(x )2 + (y )2 + (z )2

(17.63)

The general solution to the Poisson equation is

u(x, y, z) = u? (x, y, z) + w(x, y, z),


where w(x, y, z) is an arbitrary harmonic function.
Example 17.8. In this example, we compute the gravitational (or electrostatic)
potential in three-dimensional space due to a uniform solid ball, e.g., a spherical planet
such as the earth. By rescaling, it suffices to consider the case when the forcing function

1,
k x k < 1,
f (x) =
0,
k x k > 1,
is equal to 1 inside a solid ball of radius 1 and zero outside. The particular solution to the
resulting Poisson equation (17.62) is given by the integral
ZZZ
1
1
?
d d d.
(17.64)
u (x) =
4
k k<1 k x k
Clearly, since the forcing function is radially symmetric, the solution u = u(r) is also
radially symmetric. To evaluate the integral, then, we can take x = (0, 0, z) to lie on the
z axis, so that r = k x k = | z |. We use cylindrical coordinates = ( cos , sin , ), so
that
p
k x k = 2 + (z )2 .

See Figure Psph . The integral in (17.64) can then be explicitly computed:
Z 1 Z 1 2 Z 2
1
d d d
p
=
4 1 0
2 + (z )2
0
1

| z | 1,

Z
3|z| ,

1 1 p
1 + z 2 2 z | z | d =
=
2

2 1

z +1,
| z | 1.
6
2

Therefore, by radial symmetry, the solution is


1

3r
u(x) =
2

r +1,
6
2
3/7/03

762

r = k x k 1,

(17.65)

r = k x k 1,
c 2003

Peter J. Olver

plotted, as a function of r = k x k in Figure solidball . Note that, ouside the solid ball, the
solution is a Newtonian potential corresponding to a point mass of magnitude 43 , which
is the same as the total mass of the planet. This is a well-known result in gravitation and
electrostatics the exterior potential due to a spherically symmetric mass (or electric
charge) is the same as if all its mass were concentrated at its center. Interestingly, at the
center of the ball, the potential is equal to 21 , not zero, which is its asymptotic value at
large distances.
Bounded Domains and the Method of Images
Suppose we now wish to solve the inhomogeneous Poisson equation (17.52) on a
bounded domain R 3 . The spatial Greens function (17.61) is a particular solution
to the underlying inhomogeneous equation
u = (x ),

x ,

(17.66)

but it does not have the proper boundary values on . However, as we know by the
principles of linearity, the general solution to any inhomogeneous linear equation has the
form
1
u(x) =
v(x),
(17.67)
4 k x k
where the first summand is a particular solution, which we now know, while v(x) is an arbitrary solution to the homogeneous equation v = 0, i.e., an arbitrary harmonic function.
The minus sign is for later convenience. The solution (17.67) satisfies the homogeneous
boundary conditions provided the boundary values of v(x) match those of the Greens
function. Let us state the result in the case of the Dirichlet boundary value problem.
Theorem 17.9. The Greens function for the homogeneous Dirichlet boundary value
problem for the Poisson equation
u = f,

in a domain R 3 has the form

x ,

G(x; ) =

u = 0,

x ,

1
v(x; )
4 k x k

(17.68)

where v(x; ) is the harmonic function of x that satisfies


v(x; ) =

1
4 k x k

for all

x .

In this manner, we have reduced the detemination of the Greens function to the
solution to a particular set of Laplace boundary value problems, parametrized by the
point . In certain cases, the method of images will produce an explicit formula for
the Greens function. As in the planar version presented in Section 14.3, the idea is to
match the boundary values of the Greens function due to a delta impulse at a point inside
the domain with one or more Greens functions corresponding to impulses at points outside
the domain the image points.
3/7/03

763

c 2003

Peter J. Olver

Figure 17.2.

Method of Images for the Unit Sphere.

The case of a solid ball of radius 1 with Dirichlet boundary conditions is the easiest
to handle. Indeed, the same geometrical construction that we used for a disk in the plane,
and illusrated in Figure 17.2 applies to a solid ball in three-dimensional space. Although
this is the same as Figure 14.7, we are now interpreting the picture as a three-dimensional
diagram, and so the circle represents the unit sphere. We choose the image point given by
inversion:

1
=
,
so that
kk =
.
2
kk
kk
Applying the same similar triangles argument as in the planar case, we deduce that
kxk
kx k
kk
=
=
,
kxk
kk
kx k

and therefore

k x k = 1.

As a result, the function


v(x, ) =

1
1
kk
kk
=
4 k x k
4 k k k2 x k

has the same boundary values on the unit sphere as the free space Greens function:
1
kk
1
=
4 k x k
4 k x k

whenever

k x k = 1.

We conclude that the difference (17.68) between the two


1
G(x; ) =
4

1
kk

kx k
k k k2 x k

(17.69)

has the required properties of the Greens function: it satisfies the Laplace equation inside
the unit ball except at the singularity at x = , while G(x; ) = 0 has homogeneous
Dirichlet conditions on the boundary k x k = 1.
With the Greens function in hand, we can apply the general superposition formula (17.54) to arrive at a general formula for the solution to the Dirichlet boundary
value problem for the Poisson equation in the unit ball.
3/7/03

764

c 2003

Peter J. Olver

Theorem 17.10. The solution u(x) to the homogeneous Dirichlet boundary value
problem
u = f,
k x k < 1,
u = 0,
kxk = 1
is given by the integral

ZZZ
kk
1
1

u(x) =
f () d d d.
4
kx k
k k k2 x k
k k1

(17.70)

Example 17.11. In this example, we compute the electrostatic potential inside a


sphere due to a small solid ball at its center. The outside sphere k x k = 1 is assumed to
be grounded, and so the potential satisfies the homogeneous Dirichlet boundary conditions
there. The forcing function due to the interior charged sphere is take in the form

1,
k x k < ,
f (x) =
0,
< k x k < 1.
Using radial symmetry, the solution u = u(r) is also radially symmetric.
The Greens function can also be used to solve the inhomogeneous boundary value
problem
u = 0,
x ,
u = h,
x .
(17.71)
The same argument as we applied in the two-dimensional situation works here, and the
solution is
ZZ
G(x; )
u(x) =
h() dS.
(17.72)
n

In the case when is a solid ball, this integral formula effectively sums the spherical
harmonic series (17.43).

17.4. The Heat Equation in Three-Dimensional Media.


Thermal diffusion in a homogeneous solid body R 3 is governed by the threedimensional variant of the heat equation
2

u 2u 2u
u
(17.73)
= u =
+ 2 + 2 ,
(x, y, z) ,
t
x2
y
z
The coefficient > 0 measures the thermal diffusivity of the body. Positivity of the diffusivity is required in order that the heat equation be well-posed; see Section 13.1 for details.
The physical derivation of the heat equation is exactly the same as the two-dimensional
version (16.1), and does not need to be repeated in detail. Briefly, the temperature gradient
is proportional to the heat flux vector, w = u, while its divergence is proportional to
the rate of change of temperature, ut = w. Combining these two physical laws and
assuming homogeneity, whereby and are constant, produces (17.73) with = /.
3/7/03

765

c 2003

Peter J. Olver

As always, we need to impose suitable boundary conditions. These are either the
Dirichlet conditions u = h that specify the boundary temperature, or homogeneous Neumann conditions u/n = 0 corresponding to an insulated boundary, or a mixture of the
two. Given the initial temperature of the body
u(t0 , x, y, z) = f (x, y, z)

(17.74)

at the initial time t0 , there is a unique solution u(t, x, y, z) to the initial-boundary value
problem for all subsequent times t t0 ; see [32] for a proof.
To keep matters reasonably simple, we initially restrict our attention to the homogeneous boundary conditions. The general separation of variables method works as before.
One begins by imposing an exponential ansatz u(t, x) = e t v(x). Substituting into
the differential equation and canceling the exponentials, we deduce that v satisfies the
Helmholtz eigenvalue problem
v + v = 0,
subject to the relevant boundary conditions. For Dirichlet and mixed boundary conditions,
the Laplacian is a positive definite operator, and hence the eigenvalues are all strictly
positive,
0 < 1 2 ,
with
n ,
as n . Linear superposition implies that the solution can be written as a generalized
Fourier series

X
u(t, x) =
cn en t vn (x)
(17.75)
n=1

in the corresponding eigenfunctions vn (x). The coefficients cn are uniquely prescribed by


the initial condition (17.74); for t0 = 0, the initial condition takes the form
u(0, x) =

cn vn (x) = f (x).

(17.76)

n=1

Self-adjointness of the boundary value problem implies that the eigenfunctions are mutually
orthogonal, and hence we can invoke the usual orthogonality formulae
ZZZ
f (x) vn (x) dx dy dz
h f ; vn i

= ZZZ
(17.77)
cn =
k v n k2
2
vn (x) dx dy dz

in order to compute the Fourier coefficients. Since the higher modes the terms for n 0
go to zero extremely rapidly, the solution can be well approximated by the first few
terms in its Fourier expansion. As a consequence, the heat equation rapidly smooths out
discontinuities and noise in the initial data, and so can be used to denoise three-dimensional
and video images although better nonlinear techniques are now available, [107]. The
solution u(t, x) decays exponentially fast to thermal equilibrium u(t, x) 0, the same
temperature as imposed on (part of) the boundary, at a rate equal to the smallest positive
eigenvalue 1 > 0.
3/7/03

766

c 2003

Peter J. Olver

Unfortunately, the explicit formulae for the eigenfunctions and eigenvalues are known
only for a few particular domains, [90]. Most explicit solution techniques for the Helmholtz
boundary value problem rely on a further separation of variables. In a rectangular domain,
one separates into a product of functions depending upon the individual Cartesian coordinates, and the eigenfunctions are written as products of trigonometric and hyperbolic
functions. See Exercise for details. In a cylindrical domain, the separation is effected
in cylindrical coordinates, and leads to separable solutions in terms of trigonometric and
Bessel functions, as outlined in Exercise . The most interesting and enlightening case is
a spherical domain, and we treat this particular problem in complete detail.
Heating of a Ball
Let us solve the problem of heat propagation in a solid spherical body, e.g., the earth .
For simplicity, we take the diffusivity = 1, and consider the heat equation on a solid
spherical domain B1 = { k x k < 1 } of radius 1 subject to homogeneous Dirichlet boundary
conditions. Once we know how to solve this particular case, an easy scaling argument
outlined in Exercise will allow us to find the solution for a ball of arbitrary radius and
with a general diffusion coefficient.
As usual, when dealing with a spherical geometry, we adopt spherical coordinates
r, , , (B.64), in terms of which the heat equation takes the form
u
2u
cos u
2 u 2 u
1 2u
1
+
+
,
=
+
+
t
r2
r r
r2 2
r2 sin r2 sin2 2

(17.78)

where we have used our handy formula (17.14) for the Laplacian in spherical coordinates.
The diffusive separation of variables ansatz u(t, r, , ) = e t v(r, , ) requires us to
analyze the Helmholtz equation
2 u 2 u
2u
1 2u
cos u
1
+
+
+
+
+ u = 0
r2
r r
r2 2
r2 sin r2 sin2 2

(17.79)

on the unit ball = { r < 1 } with homogeneous Dirichlet boundary conditions. To solve
the spherical coordinate form of the Helmholtz equation, we invoke a further separation of
variables. To this end, we separate off the radial coordinate first by setting
v(r, , ) = w(r) (, ).
The function must be 2 periodic in and well-defined at the poles = 0, . Substituting
this ansatz in (17.79), and separating all the r-dependent terms from the terms depending
upon the angular variables , leads to a pair of differential equations; the first is an
ordinary differential equation
r2 w00 + 2 w0 + ( r 2 )w = 0,

(17.80)

In this perhaps overly simplified model, we are assuming that the earth is composed of a
completely homogeneous and isotropic solid material.

3/7/03

767

c 2003

Peter J. Olver

for the radial component w(r), while the second is a familiar partial differential equation

1 2
(17.81)
S + =
sin
+
+ = 0,
sin

sin2 2
for its angular counterpart (, ). The operator S is the spherical Laplacian (17.17)
analyzed in Section 17.2. As we learned, its eigenvalues have the form n = n(n + 1) for
n = 0, 1, 2, 3, . . .. Each eigenvalue admits 2 n + 1 linearly independent eigenfunctions
the spherical harmonics Ynm , Yenm defined in (17.32).
The radial ordinary differential equation (17.80) can be solved by letting

p(r) = r w(r).
We manually compute the derivatives
1
w = p,
r

1 dp
1
dw
=
3/2 p,
dr
r dr
2r

1 d2 p
1 dp
3
d2 w

=
3/2
+ 5/2 p.
2
2
dr
dr
r dr
2r
4r

Substituting
into (17.80) with = n = n(n + 1), and multiplying the resulting equation

by r, we discover that p(r) must solve the differential equation


d2 p
dp h 2
r
+r
+ r n +
dr2
dr
2

1
2

2 i

p = 0.

(17.82)

The latter equation is identical to the rescaled Bessel equation (16.48) in which the order
m = n + 12 is a half integer, i.e., m = 21 , 32 , 25 , . . . . Therefore, the solution to (17.82) that
remains bounded at r = 0 is (up to scalar multiple) the rescaled Bessel function

r .
p(r) = Jn+1/2
The corresponding solution

w(r) = r 1/2 Jn+1/2

(17.83)

to (17.80) is important enough to warrant a special name.

Definition 17.12. The spherical Bessel function of order n 0 is defined by the


formula
r

Sn (x) =
J
(x)
(17.84)
2 x n+1/2
q
involving the Bessel function of half integer order. The multiplicative factor 2 is included

in the definition so as to avoid annoying factors of and 2 in all subsequent formulae.


Surprisingly, unlike the Bessel functions of integer order, the spherical Bessel functions
are elementary functions! According to formula (C.55), the spherical Bessel function of
order 0 is
sin x
S0 (x) =
.
(17.85)
x
3/7/03

768

c 2003

Peter J. Olver

The higher order spherical Bessel functions can be obtained by use of a general recurrence
relation
n
dS
(17.86)
Sn+1 (x) = n + Sn (x),
dx
x
which is a consequence of Proposition C.13. The next few are, therefore,
dS0
cos x sin x
=
+ 2 ,
dx
x
x
dS1
S
sin x 3 cos x 3 sin x
S2 (x) =
+
,
+ 1 =

dx
x
x
x2
x3
dS2
2 S1 cos x 6 sin x
15 cos x 15 sin x
S3 (x) =
+

+
.
2
dx
x
x
x
x3
x4
S1 (x) =

(17.87)

Our radial solution (17.83) is, apart from an inessential constant multiple that we ignore,
a rescaled spherical Bessel function of order n:

r .
wn (r) = Sn

So far, we have not taken into account the homogeneous Dirichlet boundary condition
at r = 1. This requires

wn (1) = 0,
and hence
Sn
= 0.

Therefore, must be a root of the nth order spherical Bessel function. We use the
notation
0 < 1,n < 2,n < 3,n <
to denote the successive roots of the nth order spherical Bessel function, so that
Sn (k,n ) = 0

for

k = 1, 2, 3, . . . .

In particular the roots of the zeroth order function S0 (x) = sin x/x are just the integer
multiples of , so
k,0 = k
for
k = 1, 2, . . . .
A table of all spherical Bessel roots that are < 13 follows. The rows of the table are
indexed by n, the order, while the columns are indexed by k, the root number.
Re-assembling the individual pieces, we have now demonstrated that the separable
eigenfunctions of the Helmholtz equation on a solid ball of radius 1, when subject to
homogeneous Dirichlet boundary conditions, are products of spherical Bessel functions
and spherical harmonics,
vek,m,n (r, , ) = Sn (k,n r) Yenm (, ). (17.88)

vk,m,n (r, , ) = Sn (k,n r) Ynm (, ),


The corresponding eigenvalues
2
k,n = k,n
,

n = 0, 1, 2, . . . ,

k = 1, 2, 3, . . . ,

(17.89)

are given by the squared spherical Bessel roots. Since there are 2 n + 1 independent
spherical harmonics of order n, each eigenvalue k,n admits 2 n + 1 linearly independent
3/7/03

769

c 2003

Peter J. Olver

Table of Spherical Bessel Roots k,n


0

...

3.1416

6.2832

9.4248

...

4.4934

7.7253

10.9041

5.7635

9.0950

8.1826

11.7049

9.3558

10.5128

11.6570

8
..
.

12.7908
..
.

12.9665
..
.
..
.
..
.

12.3229
..
.
..
.

12.5664
..
.
..
.

eigenfunctions, namely vk,0,n , . . . , vk,n,n , vek,1,n , . . . , vk,n,n . (We omit the trival case vek,0,n
0.) In particular, the radially symmetric solutions are the eigenfunctions for m = n = 0,
namely
vk (r) = vk,0,0 (r) = S0 (k,0 r) =

sin k r
,
k r

k = 1, 2, . . . .

(17.90)

It can be shown that the separable solutions (17.88) form a complete system of eigenfunctions, [31].
We have thus completely determined the basic separable solutions to the heat equation
on a solid unit ball subject to homogeneous Dirichlet boundary conditions. They are
products of exponential functions of time, spherical Bessel functions of the radius and the
spherical harmonics:
2

uk,m,n (t, r, , ) = e k,n t Sn (k,n r) Ynm (, ),


2

u
ek,m,n (t, r, , ) = e k,n t Sn (k,n r) Yenm (, ).

(17.91)

The general solution can be written as an infinite FourierBesselspherical harmonic


series in these fundamental modes
u(t, r, , ) =
(17.92)

X
n
h
i
X
X
c
2
0,n
e k,n t Sn (k,n r)
=
cm,n Ynm (, ) + e
cm,n Yenm (, )
.
Yn0 (, ) +
2
n=0
m=1
k=1

The series coefficients are uniquely prescribed by the initial data, based on the usual
orthogonality relations among the eigenfunctions. Detailed formulae are relegated to the
3/7/03

770

c 2003

Peter J. Olver

exercises. In particular, the slowest decaying mode is the spherically symmetric function
uk,0,0 (t, r) =

sin r

(17.93)

corresponding to the smallest eigenvalue 1,0 = 2 . Therefore, the overall decay rate to
thermal equilibrium of a unit sphere is at a rate equal to 2 9.8696, or, to a very rough
approximation, 10.
The Fundamental Solution to the Heat Equation
For the heat equation (as well as more general diffusion equations), the fundamental
solution measures the response of the body to a concentrated unit heat source. Thus, given
a point = (, , ) in the domain occupied by the body, the fundamental solution
u(t, x) = F (t, x; ) = F (t, x, y, z; , , )
solves the initial-boundary value problem
ut = u,

u(0, x) = (x ),

for

x ,

t > 0,

(17.94)

subject to the homogeneous boundary conditions of the required form which can be
either Dirichlet, Neumann or mixed.
In general, there is no explicit formula for the fundamental solution, although in
certain domains one can construct a (generalized) Fourier series in the associated eigenfunctions. The one case amenable to a complete analysis is when the heat is distributed
over all of three-dimensional space, so = R 3 . To this end, we recall that Lemma 16.2
showed how to construct solutions of the two-dimensional heat equation as products of
one-dimensional solutions. In a similar manner, if v(t, x), w(t, x) and q(t, x) are any three
solutions to ut = uxx , then the product
u(t, x, y) = v(t, x) w(t, y) q(t, z)

(17.95)

is a solution to the three-dimensional heat equation ut = (uxx + uyy + uzz ). In particular,


choosing
2
1
v(t, x) =
e (x) /4 t ,
2 t

2
1
w(t, y) =
e (y) /4 t ,
2 t

2
1
q(t, z) =
e (z) /4 t ,
2 t

to all be one-dimensional fundamental solutions, we are immeditely led to the threedimensional fundamental solution in the form of a three-dimensional Gaussian kernel.
Theorem 17.13. The fundamental solution
2

e k x k /4 t
F (t, x; ) = F (t, x ) =
8 ( t)3/2

(17.96)

solves the three-dimensional heat equation ut = u on R 3 with an initial temperature


equal to a delta function concentrated at the point x = .
3/7/03

771

c 2003

Peter J. Olver

Thus, the initially concentrated heat energy immediately begins to spread out in a
radially symmetric manner, with a miniscule, but nonzero effect felt at arbitrarily large
distances away from the initial concentration. At each individual point x R 3 , after an
initial warm-up, the temperature decays back to zero at a rate proportional to t 3/2
even more rapidly than in two dimensions because, intuitively, there are more directions
for the heat energy to disperse.
To solve the general initial value problem with the initial temperature u(0, x, y, z) =
f (x, y, z) distributed over all of space, we first write
ZZZ
f (x, y, z) =
f () (x ) d d d
as a linear superposition of delta functions. By linearity, the solution to the initial value
problem is given by the corresponding superposition
ZZZ
2
1
f () e k x k /4 t d d d.
(17.97)
u(t, x) =
3/2
8 ( t)

of the fundamental solutions. Since the fundamental solution has exponential decay as
k x k , the superposition formula is valid even for initial temperature distributions
which are moderately increasing at large distances. We remark that the integral (17.97)
has the form of a three-dimensional convolution
ZZZ
u(t, x) = F (t, x) f (x) =
f () F (t, x ) d d d
(17.98)

of the initial data with a one-parameter family of increasingly spread out Gaussian filters.
Thus, convolution with a Gaussian kernel has the same smoothing effect on functions.
Example 17.14.
More general situations must be solved by numerical integration and approximation.

17.5. The Wave Equation in Three-Dimensional Media.


Certain classes of vibrations of a uniform solid body are governed by the threedimensional wave equation
utt = c2 u = c2 (uxx + uyy + uzz ).

(17.99)

The solution u(t, x) = u(t, x, y, z) represents a scalar-valued displacement of the body


at time t and position x = (x, y, z) R 3 . For example, u(t, x) might represent the
radial displacement of the body. One imposes suitable boundary conditions, e.g., Dirichlet,
Neumann or mixed, on , along with a pair of initial conditions
u
(17.100)
(0, x) = g(x),
x ,
t
that specify the initial displacement and initial velocity of the body. As long as the
initial and boundary data are reasonably nice, there exists a unique solution to the initialboundary value problem for all < t < . Thus, in contrast to the heat equation, one
can follow solutions to the wave equation backwards in time; see also Exercise .
u(0, x) = f (x),

3/7/03

772

c 2003

Peter J. Olver

Remark : Since the solution u(t, x) to the wave equation is scalar-valued, it cannot
measure the full range of possible three-dimensional motions of a solid body. The more
complicated dynamical systems governing the elastic motions of solids are discussed in
Exercise .
Remark : The wave equation also governs the propagation of electromagnetic waves,
such as light, radio, X-rays, etc., in a homogeneous medium, including (in the absence of
gravitational effects) empty space. Each individual component of the electric and magnetic
vector fields E, B satisfy the wave equation, in which c denotes the velocity of light.
We initially concentrate on the homogeneous boundary value problem. The fundamental vibrational modes are found by imposing our usual trigonometric ansatz
u(t, x, y, z) = cos t v(x, y, z).
Substituting into the wave equation (17.99), we discover (yet again) that v(x, y, z) must
be an eigenfunction solving the associated Helmholtz eigenvalue problem
2
,
(17.101)
c2
along with the relevant boundary conditions. In the positive definite cases, i.e., Dirichlet
and mixed boundary conditions, the eigenvalues k = k2 /c2 > 0 are all positive. Each
eigenfunction vk (x, y, z) yields two vibrational solutions
v + v = 0,

where

uk (t, x, y, z) = cos k t vk (x, y, z),


u
ek (t, x, y, z) = sin k t vk (x, y, z),

of frequency k = c k equal to the square root of the corresponding eigenvalue. The


general solution is a quasi-periodic linear combination
u(t, x, y, z) =

k=1

ak cos k t + bk sin k t vk (x, y, z)

(17.102)

of these fundamental vibrational modes. The coefficients ak , bk are uniquely prescribed by


the initial conditions (17.100). Thus,
u(0, x, y, z) =
u
(0, x, y, z) =
t

k=1

ak vk (x, y, z) = f (x, y, z),


k bk vk (x, y, z) = g(x, y, z).

k=1

The explicit formulas follow immediately from the mutual orthogonality of the eigenfunctions:
ZZZ
ZZZ
f vk dx dy dz
g vk dx dy dz
1 h g ; vk i
h f ; vk i

Z
Z
Z
Z
Z
Z
=
=
,
bk =
ak =
.
k v k k2
k k v k k 2
vk2 dx dy dz
k
vk2 dx dy dz

3/7/03

773

c 2003

(17.103)

Peter J. Olver

In the positive semi-definite Neumann boundary value problem, there is an additional


zero eigenvalue 0 = 0 corresponding to the constant null eigenfunction v0 (x, y, z) 1.
This results in two additional terms in the eigenfunction expansion a constant term
ZZZ
1
a0 =
f (x, y, z) dx dy dz
vol

that equals the average initial displacement, and an unstable mode b 0 t that grows linearly
in time, whose speed
ZZZ
1
b0 =
g(x, y, z) dx dy dz
vol

is the average of the initial velocity over the entire body. The unstable mode will be excited
if and only if there is a non-zero net initial velocity, b0 6= 0.
Most of the basic solution techniques we learned in the two-dimensional case apply
here, and we will not dwell on the details. The case of a rectangular box is a particularly
straightforward application of the method of separation of variables, and is outlined in the
exercises. A similar analysis, now in cylindrical coordinates, can be applied to the case of
a vibrating cylinder. The most interesting case is that of a solid spherical ball, which is
the subject of the next subsection.
Vibrations of a Ball
Let us focus on the radial vibrations of a solid ball, as modeled by the three-dimensional
wave equation (17.99). The solution u(t, x) represents the radial displacement of the particle that is situated at position x when the ball is at rest.
For simplicity, we look at the Dirichlet boundary value problem on a ball of radius
1. The normal modes of vibration are governed by the Helmholtz equation (17.101) on
B1 = { k x k < 1 } subject to homogeneous Dirichlet boundary conditions. According to
(17.88), the eigenfunctions are
vk,m,n (r, , ) = Sn (k,n r) Ynm (, ),
vek,m,n (r, , ) = Sn (k,n r) Yenm (, ),

k = 1, 2, 3, . . . ,
m = 0, 1, 2, . . . ,

(17.104)

n = 0, . . . , m.

Here Sn denotes the nth order spherical Bessel function (17.84), k,n is its k th root, while
Ynm , Yenm are the spherical harmonics (17.32). Each eigenvalue
2
k,n = k,n
,

n = 0, 1, 2, . . . ,

k = 1, 2, 3, . . . ,

corresponds to 2 n + 1 independent eigenfunctions, namely


vk,0,n (r, , ), vk,1,n (r, , ), . . . vk,n,n (r, , ), vek,1,n (r, , ), . . . vek,n,n (r, , ),

where we discard the trivial case vek,0,n (r, , ) 0. As a consequence, the fundamental
vibrational frequencies of a solid ball
p
(17.105)
k,n = c k,n = c k,n ,
n = 0, 1, 2, . . . ,
k = 1, 2, 3, . . . ,
3/7/03

774

c 2003

Peter J. Olver

are equal to the spherical Bessel roots k,n multiplied by the wave speed. There are a
total of 2 (2 n + 1) independent vibrational modes associated with each distinct frequency
(17.105), namely
uk,m,n (t, r, , ) = cos(c k,n t) Sn (k,n r) Ynm (, ),
u
bk,m,n (t, r, , ) = sin(c k,n t) Sn (k,n r) Ynm (, ),

u
ek,m,n (t, r, , ) = cos(c k,n t) Sn (k,n r) Yenm (, ),
b
u
ek,m,n (t, r, , ) = sin(c k,n t) Sn (k,n r) Yenm (, ).

k = 1, 2, 3, . . . ,
m = 0, 1, 2, . . . ,

(17.106)

n = 0, . . . , m.

In particular, the radially symmetric modes of vibration have, according to (17.85), the
elementary form
sin k r
,
r
k = 1, 2, 3, . . . . (17.107)
sin k r
u
bk,0,0 (r, , ) = sin c k t S0 (k r) = sin c k t
,
r
Their vibrational frequencies, k,0 = c k , are integral multiples of the lowest freqency
0,1 = . Therefore, interestingly, if you only excite the radially symmetric modes, the
ball would vibrate periodically motion.
More generally, adopting the same scaling argument as in (16.100), we conclude that
the fundamental frequencies for a solid ball of radius R and wave speed c are given by
k,n = c k,n /R. The relative vibrational frequencies
uk,0,0 (r, , ) = cos c k t S0 (k r) = cos c k t

k,n
k,n
k,n
=
=
1,0
1,0

(17.108)

are independent of the size of the ball R or the wave speed c. In the accompanying table,
we display all relative vibrational frequencies that are less than 4 in magnitude. The rows
are indexed by n, the order of the spherical harmonic, while the columns are indexed by
k, the root number.
The purely radial modes of vibration (17.107) have individual frequencies
k,0 =

k c
,
R

so

k,n
= k,
1,0

and appear in the first row of the table. The lowest frequency is 1,0 = c/R, corresponding to a vibration with period 2 /1,0 = 2 R/c. In particular, for the earth, the
radius R 6, 000 km and the wave speed in rock is, on average, c 5 km/sec, so that the
fundamental mode of vibration has period 2 R/c 2400 seconds, or 40 minutes. Vibrations of the earth are also known as seismic waves and, of course, earthquakes are their
most severe manifestation. Therefore understanding the modes of vibration is an issue of
critical importance in geophysics and civil engineering, including the design of structures,
buildings and bridges and the avoidance of resonant frequencies.
Of course, we have suppressed almost all interesting terrestrial geology in this very
crude approximation, which has been based on the assumption that the earth is a uniform
3/7/03

775

c 2003

Peter J. Olver

Relative Spherical Bessel Roots k,n /1,0


0

...

1.0000

2.0000

3.0000

...

1.4303

2.4590

3.4709

1.8346

2.8950

2.2243

3.3159

2.6046

2.9780

3.3463

8
..
.

3.7105
..
.

3.7258
..
.
..
.
..
.

3.9225
..
.
..
.

4.0000
..
.
..
.

body, vibrating only in its radial direction. A more realistic modeling of the vibrations
of the earth requires an understanding of the basic partial differential equations of linear
and nonlinear elasticity, [56]. Nonuniformities in the earth lead to scattering of the resulting vibrational waves. These in turn are used to understand the geological structures
underneath the ground. For instance, setting off and then measuring small scale seismic
vibrations is the primary means of determining its underlying structure, with oil and mineral exploration being a particularly important application. We refer the interested reader
to [6] for a comprehensive introduction to mathematical seismology.
Remark : The number of spherical harmonics governs the energy levels or orbital shells
occupied by electrons in an atom. In chemistry, the electron levels are indexed by order n
of the spherical harmonic, and traditionally labeled by a letter in the sequence p, s, d, f, . . ..
Thus, the order n = 0 spherical harmonics correspond to the p shells; the 3 harmonics of
order n = 1 are the s shells, and so on. Since electrons are allowed to have one of two
possible spins, the Pauli exclusion principle tells us that each energy shell can be occupied
by at most two electrons. Thus, the number of electrons that can reside in the n th energy
level of an atom is 2(2 n + 1), the same as the number of linearly independent solutions to
the wave equation associated with a given energy level. The configuration of energy shells
and electrons in atoms are responsible for the periodic table. Thus, hydrogen has a single
electron in the p shell. Helium has two electons in the p shell. Lithium has 3 electrons,
with two of them filling the first p shell and the third in the second p shell. Neon has 10
electrons filling the two p and first three s shells. And so on. The chemical properties
of the elements are, to a very large extent, determined by the placement of the electrons
within the different shells. See [Chem] for further details.
Example 17.15. The radial vibrations of a hollow spherical shell (e.g., an elastic
3/7/03

776

c 2003

Peter J. Olver

balloon) are governed by the differential equation


2

u
1 2u
u
2
2
+ cot
+
utt = c S [ u ] = c
,
2
sin2 2

(17.109)

where S denotes the spherical Laplacian (17.17). The radial displacement u(t, , ) of a
point on the sphere only depends on time t and the angular coordinates , . The solution
u(t, , ) is required to be 2 periodic in the meridial angle and bounded at the poles
= 0, .
According to (17.32), the nth eigenvalue n = n(n+1) of the spherical Laplacian leads
to 2 n + 1 linearly independent spherical harmonic eigenfunctions
Yn0 (, ), Yn1 (, ), . . . , Ynn (, ), Yen1 (, ), . . . , Yenn (, ).

As a consequence, the fundamental frequencies of vibration for a spherical shell are


p
p
(17.110)
n = 0, 1, 2, . . . .
n = c n = c n(n + 1) ,

The vibrational solutions are quasi-periodic combinations of the fundamental modes


p
p
cos n(n + 1) t Ynm (, ),
sin n(n + 1) t Ynm (, ),
(17.111)
p
p
sin n(n + 1) t Yenm (, ),
cos n(n + 1) t Yenm (, ),

involving the spherical harmonics.


The smallest positive eigenvalue is 1 = 2, yielding a
lowest tone of frequency 1 = c 2. The higher order frequencies are irrational multiples
of the lowest order one, 1 = c, and hence a spherical bell sounds percussive to our ears.
The spherical Laplacian operator is only positive semi-definite, since the lowest mode
has eigenvalue 0 = 0, which corresponds to the constant null eigenfunction v0 (, ) =
Y00 (, ) 1. Therefore, the wave equation admits an unstable mode b0,0 t, corresponding
to a uniform radial inflation. The coefficient
ZZ
3
u
(0, , ) dS
b0,0 =
4
S1 t
represents the spheres average initial velocity. The existence of such an unstable mode is
an artifact of the simplified linear model we are using, that fails to account for nonlinearly
elastic effects that serve to constrain the inflation of a spherical balloon.

17.6. Spherical Waves and Huygens Principle.


The fundamental solution to the wave equation measures the effect of applying an
instantaneous concentrated unit impulse at a single point. Two physical examples to keep
in mind are the light waves propagating from a sudden concentrated blast, e.g., a stellar
supernova or a lightning bolt, and the sound waves from an explosion or thunderclap,
propagating in air at a much slower speed.
In a uniform isotropic medium, e.g., empty space, the initial blast leads to a spherically
expanding wave, moving away at the speed of light or sound in all directions. Using
translation invariance, we can assume that the source is at the origin, and so the solution
3/7/03

777

c 2003

Peter J. Olver

u(t, x) should only depend on the distance r = k x k from the source. We change over
to spherical coordinates and look for a solution u = u(t, r) to the three-dimensional wave
equation with no angular dependence. Substituting the formula (17.14) for the spherical
Laplacian and setting the angular derivatives to 0, we are led to the partial differential
equation

2
2u
u 2 u
2
(17.112)
=c
+
t2
r2
r r
that governs the propagation of spherically symmetric waves. It turns out, surprisingly,
that we can solve this partial differential equation. The secret is to multiply both sides of
the equation by r. The resulting equation can be written in the form

2
2u
2(
2u
u
2
2
r
=
c
u)t
=
r
=
c
+
2
(r u),
r2
t2
r2
r
r2
and so (17.112) reduces to
2
2w
2 w
=c
,
t2
r2

where

w(t, r) = r u(t, r).

(17.113)

Therefore the function w(t, r) is a solution to the one-dimensional wave equation!


According to Theorem 13.7, the general solution to (17.113) has the dAlembert form
w(t, r) = p(r c t) + q(r + c t),
where p() and q() are arbitrary functions of a single characteristic variable. Reverting
back to u = w/r, we conclude that the spherically symmetric solutions to the threedimensional wave equation are all of the form
u(t, r) =

p(r c t)
q(r + c t)
+
.
r
r

The first term


u(t, r) =

(17.114)

p(r c t)
r

(17.115)

in the solution (17.114) represents a wave moving at speed c in the direction of increasing
r away from the origin. It describes the effect of a variable light source concentrated
at the origin. Think, for instance, of a pulsating quasar in interstellar space. To highlight
this interpretation, let us look at the basic case when p(s) = (s a) be a delta function
at s = a; more general such solutions can then be assembled by linear superposition. The
solution

r c(t t0 )
(r c t a)
a
u(t, r) =
=
,
where
t0 = .
(17.116)
r
r
c
will represent a concentrated spherical wave. At the instant t = t0 , the light is entirely
concentrated at the origin r = 0. The light impulse then moves away from the origin at
speed c in all directions. At each later time t > t0 , the initially concentrated light source
is now spread out over the surface of a sphere of radius r = c (t t0 ). The intensity of
3/7/03

778

c 2003

Peter J. Olver

the signal at each point on the sphere, however, has decreased by a factor 1/r, and so, the
farther from the source, the weaker the signal. An observer sitting at a fixed point away
from the source will only see an instantaneous flash of light as the spherical wave passes
by. A similar phenomenon holds for sound waves the sound of the explosion will only
last momentarily. Thunder and lightning are the most familiar examples of this everyday
phenomenon. On the other hand, for t < t₀, the impulse is concentrated at a negative radius r = c (t − t₀) < 0. To interpret this, note that, for a given value of the spherical angles φ, θ, the point
\[
x = r\,\sin\varphi\,\cos\theta, \qquad y = r\,\sin\varphi\,\sin\theta, \qquad z = r\,\cos\varphi,
\]
for r < 0 lies on the antipodal point of the sphere of radius | r |, so that replacing r by − r has the same effect as changing x to − x. Thus, the solution (17.116) represents a
concentrated spherically symmetric light wave arriving from the edges of the universe at
speed c, that strengthens in intensity as it collapses into the origin at t = t₀. After collapse,
it immediately reappears in expanding form.
The second solution in the d'Alembert formula (17.114) has, in fact, exactly the same physical form. Indeed, if we set
\[
\hat r = -\,r, \qquad p(\xi) = q(-\,\xi), \qquad \text{then} \qquad
\frac{q(r + c\,t)}{r} = -\,\frac{p(\hat r - c\,t)}{\hat r}.
\]

Therefore, to represent the general radially symmetric solution to the three-dimensional


wave equation, we only need use one of these constituents, and thus only need to consider
solutions of the form (17.116) from now on.
In order to utilize such spherical wave solutions, we need to understand the nature of
their originating singularity. For simplicity, we set a = 0 in (17.116) and concentrate on
the particular solution
\[
u(t,r) = \frac{\delta(r - c\,t)}{r},
\tag{17.117}
\]
which has a singularity at the origin r = 0 when t = 0. We need to pin down precisely
which sort of distribution this solution represents. Invoking the limiting definition of a
distribution is tricky, and it will be easier to use the dual definition as a linear functional.
Thus, at a fixed time t ≥ 0, we must evaluate the inner product
\[
\langle\, u\,; f \,\rangle = \iiint u(t,x,y,z)\, f(x,y,z)\; dx\,dy\,dz
\]
of the solution with a smooth test function f (x) = f (x, y, z). We convert to spherical
coordinates using the change of variables formula (B.66), whereby
\[
\langle\, u\,; f \,\rangle
= \int_0^{2\pi}\!\!\int_0^{\pi}\!\!\int_0^{\infty} \frac{\delta(r - c\,t)}{r}\, f(r,\theta,\varphi)\; r^2 \sin\varphi\; dr\,d\varphi\,d\theta
= c\,t \int_0^{2\pi}\!\!\int_0^{\pi} f(c\,t,\theta,\varphi)\,\sin\varphi\; d\varphi\,d\theta.
\tag{17.118}
\]


Therefore, ⟨u ; f⟩ = 4π c t M⁰_{ct}[ f ], where
\[
M^0_{c\,t}[\,f\,] = \frac{1}{4\pi c^2 t^2} \iint_{S_{c\,t}} f\; dS
= \frac{1}{4\pi} \int_0^{2\pi}\!\!\int_0^{\pi} f(c\,t,\theta,\varphi)\,\sin\varphi\; d\varphi\,d\theta
\tag{17.119}
\]

is the mean or average value of the function f on the sphere S_{ct} = { ‖x‖ = c t } of radius r = c t centered at the origin 0. In particular, in the limit as t → 0, the mean over the
sphere of radius r = 0 is equal to the value of the function at the origin:
\[
M^0_0[\,f\,] = f(0),
\tag{17.120}
\]

and so, at t = 0, the formula implies that ⟨u ; f⟩ = 0 for all functions f. Consequently,
u(0, r) ≡ 0 represents a trivial zero initial displacement.
How, then, can the solution be nonzero? Clearly, this must be the result of a nonzero
initial velocity. Thus, we differentiate (17.118) with respect to t, whereby

\[
\Bigl\langle\, \frac{\partial u}{\partial t}\,; f \,\Bigr\rangle = \frac{\partial}{\partial t}\,\langle\, u\,; f\,\rangle
= c \int_0^{2\pi}\!\!\int_0^{\pi} f(c\,t,\theta,\varphi)\,\sin\varphi\; d\varphi\,d\theta
+ c^2 t \int_0^{2\pi}\!\!\int_0^{\pi} \frac{\partial f}{\partial r}(c\,t,\theta,\varphi)\,\sin\varphi\; d\varphi\,d\theta
= 4\pi c\, M^0_{c\,t}[\,f\,] + 4\pi c^2 t\, M^0_{c\,t}\Bigl[\,\frac{\partial f}{\partial r}\,\Bigr].
\tag{17.121}
\]

The result is a linear combination of the mean of f and of its radial derivative ∂f/∂r over the
sphere of radius c t. In particular, at t = 0, using (17.120),

\[
\langle\, u_t\,; f \,\rangle \big|_{t=0} = 4\pi c\, M^0_0[\,f\,] = 4\pi c\, f(0).
\]


We conclude that, at t = 0, the initial velocity

\[
u_t(0, x) = 4\pi c\,\delta(x)
\]
is a multiple of a delta function at the origin! Dividing through by 4π c, we conclude that the spherical expanding wave
\[
u(t,r) = \frac{\delta(r - c\,t)}{4\pi c\, r}
\tag{17.122}
\]
is the solution to the initial value problem
\[
u(0, x) \equiv 0, \qquad \frac{\partial u}{\partial t}(0, x) = \delta(x),
\]
corresponding to an initial unit velocity impulse concentrated at the origin. This solution
can be viewed as the three-dimensional version of striking a piano string with a hammer.
More generally, if our unit impulse is concentrated at the point ξ, we invoke the
translational symmetry of the wave equation to conclude that the function

\[
G(t, x; \xi) = \frac{\delta\bigl(\|x - \xi\| - c\,t\bigr)}{4\pi c\, \|x - \xi\|}, \qquad t \ge 0,
\tag{17.123}
\]


is the solution to the wave equation that satisfies the initial conditions
\[
G(0, x; \xi) = 0, \qquad \frac{\partial G}{\partial t}(0, x; \xi) = \delta(x - \xi).
\tag{17.124}
\]

By linearity, any superposition of these spherical waves will also be a solution to the wave
equation. Thus, for the initial conditions
\[
u(0, x, y, z) = 0, \qquad \frac{\partial u}{\partial t}(0, x, y, z) = g(x, y, z),
\tag{17.125}
\]

representing a zero initial displacement, we write the initial velocity


\[
g(x) = \iiint g(\xi)\,\delta(x - \xi)\; d\xi\,d\eta\,d\zeta
\]
as a superposition of delta functions, and immediately conclude that the relevant solution
is the identical superposition of spherical waves

\[
u(t, x) = \frac{1}{4\pi c} \iiint g(\xi)\,\frac{\delta\bigl(\|x - \xi\| - c\,t\bigr)}{\|x - \xi\|}\; d\xi\,d\eta\,d\zeta
= \frac{1}{4\pi c^2 t} \iint_{\|x - \xi\| = c\,t} g(\xi)\; dS.
\tag{17.126}
\]
Therefore the value of the solution at a point x and time t ≥ 0 is equal to
\[
u(t, x) = t\, M^x_{c\,t}[\,g\,],
\tag{17.127}
\]

namely t times the mean of the initial velocity function g over a sphere of radius r = c t
centered at the point x.
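Formula (17.127) is straightforward to evaluate numerically: sample points on the sphere of radius c t about x, average the initial velocity g there, and multiply by t. The following short Python sketch is supplementary to the text; the Gaussian initial velocity is an arbitrary choice made purely for illustration.

    import numpy as np

    def spherical_mean(g, x, radius, n=200_000):
        """Monte Carlo estimate of the mean of g over the sphere of the given radius about x."""
        v = np.random.normal(size=(n, 3))
        v /= np.linalg.norm(v, axis=1, keepdims=True)   # uniform points on the unit sphere
        return g(x + radius * v).mean()

    def u(t, x, g, c=1.0):
        """Solution (17.127): zero initial displacement, initial velocity g."""
        return t * spherical_mean(g, np.asarray(x, float), c * t)

    # illustrative initial velocity: a smooth Gaussian bump centered at the origin
    g = lambda p: np.exp(-np.sum(p**2, axis=-1))

    print(u(0.5, [2.0, 0.0, 0.0], g))   # essentially 0: the signal has not yet arrived
    print(u(2.0, [2.0, 0.0, 0.0], g))   # the sphere of radius ct now meets the bump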
Example 17.16. Let us set the wave speed c = 1 for simplicity. Suppose that the
initial velocity

\[
g(x) = \begin{cases} 1, & \|x\| < 1, \\ 0, & \|x\| > 1, \end{cases}
\]
is 1 within the unit ball B1 centered at the origin, and 0 outside the ball. According to
the formula (17.126) for the solution at a point x and time t ≥ 0, we need to compute the average value of g over a sphere S^x_t of radius t centered at x. Since g = 0 outside the unit sphere, its average will be equal to the surface area of that part of the sphere that is contained inside the unit ball, i.e., S^x_t ∩ B₁, divided by the total surface area of S^x_t, namely
4π t². The two spheres will intersect if either
(a)  r > 1 and r − 1 < t < r + 1,    or    (b)  r < 1 and 1 − r < t < 1 + r.

If t > 1 + r or t < r − 1 and r > 1, then the sphere of radius t lies entirely outside the unit ball, and so the mean is 0; if t < 1 − r and r < 1, then the sphere lies entirely within the unit ball and the mean is 1. Otherwise, referring to Figure bs , and using Exercise , we see that the area of the spherical cap S^x_t ∩ B₁ is, by the Law of Cosines,

\[
2\pi t^2 (1 - \cos\alpha) = 2\pi t^2 \Bigl(1 - \frac{t^2 + r^2 - 1}{2\,r\,t}\Bigr)
= \frac{\pi t}{r}\,\bigl[\,1 - (t - r)^2\,\bigr],
\]

[Figure 17.3. Time Plot of the Solution to the Wave Equation at Three Fixed Positions: r = .3, r = .7, r = 1.3.]

where r = ‖x‖ and α denotes the azimuthal angle describing the circle of intersection
between the two spheres. Therefore,
\[
M^x_{c\,t}[\,g\,] = \begin{cases}
1, & 0 \le t \le 1 - r, \\[4pt]
\dfrac{1 - (t - r)^2}{4\,r\,t}, & |\,r - 1\,| \le t \le 1 + r, \\[4pt]
0, & 0 \le t \le r - 1 \ \text{ or } \ t \ge 1 + r.
\end{cases}
\tag{17.128}
\]

The solution (17.127) is obtained by multiplying by t, and hence, for t ≥ 0,
\[
u(t, x) = \begin{cases}
t, & 0 \le t \le 1 - \|x\|, \\[4pt]
\dfrac{1 - (t - \|x\|)^2}{4\,\|x\|}, & |\,1 - \|x\|\,| \le t \le 1 + \|x\|, \\[4pt]
0, & 0 \le t \le \|x\| - 1 \ \text{ or } \ t \ge 1 + \|x\|.
\end{cases}
\tag{17.129}
\]

As illustrated in Figure 17.3, an observer sitting inside the sphere at a distance r < 1 away
from the origin will experience a linearly increasing light intensity followed by a parabolic
decrease to 0 intensity, where it remains from then on. If the observer is closer to the
edge than the center, the parabolic portion will continue to increase for a while before
eventually tapering off. On the other hand, an observer sitting outside the sphere will
experience, after an initially dark period, a parabolic increase to a maximal intensity and then a symmetrical decrease, returning to dark after a total time lapse of 2. We also show a plot of u as a function of r = ‖x‖ for various times in Figure wbr . Note that the light stays brightest in a sphere of gradually decreasing radius. At time t = 1 there remains a cusp, after which the solution is bright inside the domain lying between two concentric spheres of respective radii t − 1 and t + 1.
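A few lines of Python suffice to reproduce the time plots of Figure 17.3 from the closed-form solution (17.129); this sketch is supplementary to the text, and the three radii are chosen to match the panels of the figure.

    import numpy as np
    import matplotlib.pyplot as plt

    def u_ball_velocity(t, r):
        """Closed-form solution (17.129): unit initial velocity inside the unit ball, c = 1."""
        t = np.asarray(t, float)
        out = np.zeros_like(t)
        out[t <= 1 - r] = t[t <= 1 - r]                 # linearly increasing portion
        mid = (np.abs(1 - r) <= t) & (t <= 1 + r)
        out[mid] = (1 - (t[mid] - r)**2) / (4 * r)      # parabolic portion
        return out                                      # zero otherwise

    ts = np.linspace(0, 3, 600)
    for r in (0.3, 0.7, 1.3):
        plt.plot(ts, u_ball_velocity(ts, r), label=f"r = {r}")
    plt.legend(); plt.xlabel("t")
    plt.title("Solution (17.129) at three fixed positions")
    plt.show()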
The solution described by formula (17.126) only handles initial velocities. How do
we construct a solution corresponding to a nonzero initial displacement? Surprisingly, the
answer is differentiation. The key observation is that if u(t, x) is any (sufficiently smooth)
solution to the wave equation, so is its time derivative
v(t, x) =

u
(t, x).
t

This follows at once from differentiating both sides of the wave equation with respect to t
and using the equality of mixed partial derivatives. Physically, this implies that the velocity
of a wave obeys the same evolutionary principle as the wave itself, which is a manifestation

of the linearity and time-independence (autonomy) of the equation. Suppose u has initial
conditions
u(0, x) = f (x),
ut (0, x) = g(x).
What are the initial conditions for its derivative v = ut ? Clearly, its initial displacement
v(0, x) = ut (0, x) = g(x) equals the initial velocity of u. As for its initial velocity, we have
\[
\frac{\partial v}{\partial t} = \frac{\partial^2 u}{\partial t^2} = c^2\,\Delta u
\]
because we are assuming that u solves the wave equation. Thus, at the initial time
\[
\frac{\partial v}{\partial t}(0, x) = c^2\,\Delta u(0, x) = c^2\,\Delta f(x)
\]
equals c² times the Laplacian of the initial displacement f. In particular, if u satisfies the
initial conditions
\[
u(0, x) = 0, \qquad u_t(0, x) = g(x),
\tag{17.130}
\]
then v = ut satisfies the initial conditions
\[
v(0, x) = g(x), \qquad v_t(0, x) = 0.
\tag{17.131}
\]

Thus, paradoxically, to solve the initial displacement problem we differentiate the initial
velocity solution (17.126) with respect to t, and hence

\[
v(t, x) = \frac{\partial u}{\partial t}(t, x)
= \frac{\partial}{\partial t}\Bigl( t\, M^x_{c\,t}[\,g\,] \Bigr)
= M^x_{c\,t}[\,g\,] + c\,t\, M^x_{c\,t}\Bigl[\,\frac{\partial g}{\partial n}\,\Bigr],
\tag{17.132}
\]

using our computation in (17.121). Therefore, v(t, x) is a linear combination of the mean
of the function g and the mean of its normal or radial derivative ∂g/∂n, taken over a sphere
of radius c t centered at the point x. In particular, to obtain the solution corresponding to
a concentrated initial displacement,
\[
F(0, x; \xi) = \delta(x - \xi), \qquad \frac{\partial F}{\partial t}(0, x; \xi) = 0,
\tag{17.133}
\]
we differentiate the solution (17.123), so
\[
F(t, x; \xi) = \frac{\partial G}{\partial t}(t, x; \xi)
= -\,\frac{\delta'\bigl(\|x - \xi\| - c\,t\bigr)}{4\pi\, \|x - \xi\|},
\tag{17.134}
\]

which represents a spherically expanding doublet, cf. Figure 10.10. Thus, interestingly, a
concentrated initial displacement spawns a spherical doublet or derived delta wave, whereas
a concentrated initial velocity spawns a singlet delta wave.

A similar device is used to initiate the numerical solution method for the wave equation; see
Section 13.5.


[Figure 17.4. Time Plot of Solutions to the Wave Equation at Three Fixed Positions: r = .3, r = .7, r = 1.3.]

Example 17.17. Let c = 1 for simplicity. Consider the initial displacement

\[
u(0, x) = f(x) = \begin{cases} 1, & \|x\| < 1, \\ 0, & \|x\| > 1, \end{cases}
\]

along with zero initial velocity, corresponding to an instantaneously illuminated solid glass
ball. To obtain the solution, we try differentiating (17.129) with respect to t, leading to

\[
u(t, x) = \begin{cases}
1, & 0 \le t < 1 - \|x\|, \\[4pt]
\dfrac{\|x\| - t}{2\,\|x\|}, & |\,1 - \|x\|\,| \le t \le 1 + \|x\|, \\[4pt]
0, & 0 \le t < \|x\| - 1 \ \text{ or } \ t > 1 + \|x\|.
\end{cases}
\tag{17.135}
\]

As illustrated in Figure 17.4, an observer sitting inside the ball at radius r < 1 will begin
by experiencing a constant intensity, followed by a sudden jump, then linear decrease,
and finally a jump back to quiescent, while an observer sitting outside, with r > 1, will
experience, after an initially dark period, a sudden jump in the light intensity, followed by
a linear decrease to darkness. The size of the jump depends upon the distance from the
ball.

By linearity, we can combine the two solutions (17.127), (17.132) together, and have
thus established a d'Alembert-type solution formula for the wave equation in three-dimensional
space.
Theorem 17.18. The solution to the initial value problem
\[
u_{tt} = c^2\,\Delta u, \qquad u(0, x) = f(x), \qquad \frac{\partial u}{\partial t}(0, x) = g(x), \qquad x \in \mathbb{R}^3,
\tag{17.136}
\]
for the wave equation in three-dimensional space is given by
\[
u(t, x) = M^x_{c\,t}[\,f\,] + c\,t\, M^x_{c\,t}\Bigl[\,\frac{\partial f}{\partial n}\,\Bigr] + t\, M^x_{c\,t}[\,g\,],
\tag{17.137}
\]
where M^x_{ct}[ f ] denotes the average value of the function f over a sphere of radius c t centered at position x.
Observe that the value of the solution (17.137) at a point x and time t only depends
upon the values of the initial displacements and velocities at a distance c t away. Physically,
this means that the light that we see at a given time t arrived from points at a distance
exactly d = c t away at time t = 0. In particular, a sharp, localized initial signal, whether

initial displacement or initial velocity, that is concentrated near a point produces a sharp,
localized response concentrated on a sphere surrounding the point at all subsequent times.
In our three-dimensional universe, we only witness the light from an explosion for a brief moment, after which, if there is no subsequent light source, the view returns to darkness. Similarly, a sharp sound remains sharply concentrated, with diminishing magnitude, as it propagates through space. This phenomenon was first highlighted by the seventeenth-century Dutch scientist Christiaan Huygens and is known as Huygens' Principle in his honor. Remarkably, as we will show next, Huygens' Principle does not hold in a two-dimensional universe! In the plane, concentrated impulses will be spread out as time progresses.
The Method of Descent
So far, we have explicitly determined the response of the wave equation to an initial
displacement and initial velocity in one- and three-dimensional space. The two-dimensional
case
\[
u_{tt} = c^2\,\Delta u = c^2\,(u_{xx} + u_{yy})
\tag{17.138}
\]
is, counter-intuitively, more complicated! For instance, looking for a radially symmetric
solution u(t, r) leads to the partial differential equation
\[
\frac{\partial^2 u}{\partial t^2} = c^2\Bigl(\frac{\partial^2 u}{\partial r^2} + \frac{1}{r}\,\frac{\partial u}{\partial r}\Bigr),
\tag{17.139}
\]
which, unlike its three-dimensional cousin (17.112), cannot be so easily integrated.
However, our solution to the three-dimensional problem can be easily adapted to
construct a solution using the so-called method of descent. Any solution u(t, x, y) to
the two-dimensional wave equation (17.138) can be viewed as a solution to the three-dimensional wave equation (17.99) that does not depend upon the vertical z coordinate, whence ∂u/∂z ≡ 0. Clearly, if the initial data does not depend on z, then the resulting
solution u(t, x, y) will also be independent of z.
Consider first the solution formula (17.126) corresponding to initial conditions
\[
u(0, x, y) = 0, \qquad \frac{\partial u}{\partial t}(0, x, y) = g(x, y),
\tag{17.140}
\]

of zero initial displacement, but nonzero initial velocity. We rewrite the formula in the form of a surface integral over the sphere S_{ct} = { ‖ξ‖ = c t } centered at the origin:
\[
u(t, x) = \frac{1}{4\pi c^2 t} \iint_{\|\xi - x\| = c\,t} g(\xi)\; dS
= \frac{1}{4\pi c^2 t} \iint_{\|\xi\| = c\,t} g(x + \xi)\; dS.
\tag{17.141}
\]
Imposing the condition that g(x, y) does not depend upon the z coordinate, we see that the integrals over the upper and lower hemispheres
\[
S^+_{c\,t} = \bigl\{\, \|\xi\| = c\,t,\ \zeta \ge 0 \,\bigr\}, \qquad
S^-_{c\,t} = \bigl\{\, \|\xi\| = c\,t,\ \zeta \le 0 \,\bigr\},
\]

are identical. As in (B.45), to evaluate the upper hemispherical integral, we parametrize the upper hemisphere as the graph ζ = √((c t)² − ξ² − η²) over the disk D_{ct} = { ξ² + η² ≤ c² t² }, and so
\[
u(t, x, y) = \frac{1}{2\pi c^2 t} \iint_{S^+_{c\,t}} g(x + \xi)\; dS
= \frac{1}{2\pi c} \iint_{D_{c\,t}} \frac{g(x + \xi,\, y + \eta)}{\sqrt{(c\,t)^2 - \xi^2 - \eta^2}}\; d\xi\,d\eta,
\tag{17.142}
\]
which solves the initial value problem (17.140). In particular, if we take the initial velocity
g(x, y) = δ(x − ξ) δ(y − η) to be a concentrated impulse, then the resulting solution is

\[
G(t, x, y; \xi, \eta) = \begin{cases}
0, & (x - \xi)^2 + (y - \eta)^2 > c^2 t^2, \\[6pt]
\dfrac{1}{2\pi c\,\sqrt{c^2 t^2 - (x - \xi)^2 - (y - \eta)^2}}\,, & (x - \xi)^2 + (y - \eta)^2 < c^2 t^2.
\end{cases}
\tag{17.143}
\]
Thus, given a concentrated impulse in the velocity at time t = 0, an observer sitting at position x will first experience a concentrated light wave at time t = ‖x‖/c. However, in contrast to the three-dimensional solution, the observer will continue to experience a nonzero signal after the initial disturbance has passed, with decreasing magnitude proportional to 1/(c t); see the first graph in Figure nhp2 . Thus, although the initial condition is concentrated, in contrast to the three-dimensional case, the resulting solution is not. In a two-dimensional universe, Huygens' principle is not valid. A two-dimensional creature would experience not only an initial effect of any sound or light wave but also an afterglow with slowly diminishing magnitude. It would be like living in a permanent echo chamber, and so understanding and acting upon sensory phenomena would be more challenging in a two-dimensional universe. In general, Huygens' principle is only valid in odd-dimensional spaces; see also [13] for recent advances in the classification of partial differential equations that admit a Huygens' principle.
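The contrast with the three-dimensional case is easy to see numerically. The short Python sketch below (illustrative only, not from the text) evaluates the two-dimensional fundamental solution (17.143) at a fixed observation point: the signal is zero until t = ‖x‖/c, blows up as the wave front passes, and then decays slowly instead of returning to zero.

    import numpy as np

    def G2d(t, x, y, xi=0.0, eta=0.0, c=1.0):
        """Two-dimensional fundamental solution (17.143) for a unit velocity impulse at (xi, eta)."""
        rho2 = (x - xi)**2 + (y - eta)**2
        inside = rho2 < (c * t)**2
        safe = np.maximum((c * t)**2 - rho2, 1e-300)      # avoid sqrt of a negative number
        return np.where(inside, 1.0 / (2 * np.pi * c * np.sqrt(safe)), 0.0)

    x, y = 1.0, 0.0                      # observation point at distance 1 from the impulse
    for t in (0.5, 1.01, 2.0, 5.0, 10.0):
        print(t, G2d(t, x, y))           # zero before t = 1, then a slowly decaying afterglow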
Similarly, the solution to the initial displacement conditions
\[
u(0, x, y) = f(x, y), \qquad \frac{\partial u}{\partial t}(0, x, y) = 0,
\tag{17.144}
\]
can be obtained by differentiation with respect to t. Thus,
\[
u(t, x, y) = \frac{\partial}{\partial t}\left( \frac{1}{2\pi c} \iint_{D_{c\,t}} \frac{f(x + \xi,\, y + \eta)}{\sqrt{(c\,t)^2 - \xi^2 - \eta^2}}\; d\xi\,d\eta \right)
\tag{17.145}
\]

is the desired solution. The general solution is a linear combination of the two types of solutions (17.142), (17.145). Note that the solution at a point x at time t depends upon the initial displacement and velocity on the entire disk of radius c t centered at the point, and not just on the points a distance c t away.
Remark : Since the solutions to the two-dimensional wave equation can be interpreted
as three-dimensional solutions with no z dependence, a concentrated delta impulse in the
two-dimensional wave equation would correspond to a concentrated line impulse in three
dimensions. If light starts propagating from the line at t = 0, after the initial signal reaches
us, we will continue to receive light from points that are progressively farther away along
the line, which accounts for the two-dimensional afterglow.

Chapter 18
Nonlinear Systems
Nonlinearity is ubiquitous in physical phenomena. Fluid mechanics, elasticity, relativity, chemical reactions, combustion, ecology, biomechanics, and many, many others are
all governed by inherently nonlinear equations. (The one notable exception is quantum
mechanics, which is a fundamentally linear theory. More recent attempts at grand unification of all fundamental physical theories, such as string theory and conformal field theory,
do venture into the nonlinear realm.) For this reason, an increasingly large fraction of
modern mathematical research is devoted to the analysis of nonlinear systems. The advent of powerful computers has finally placed nonlinearity within our grasp, and thereby
fomented a revolution in our understanding and development of nonlinear mathematics.
Indeed, many of the most important modern analytical techniques drew their inspiration
from early computer forays into the uncharted nonlinear wilderness.
Why, then, have we spent the overwhelming majority of this text developing purely
linear mathematics? The facile answer, of course, is that nonlinear systems are vastly more
difficult to analyze. In the nonlinear regime, many basic questions remain unanswered;
existence and uniqueness of solutions are not guaranteed; explicit formulae are difficult
to come by; linear superposition is no longer available; numerical approximations are not
always sufficiently accurate; etc., etc. But, a more intelligent answer is that, without a
proper understanding of linear phenomena and linear mathematics, one has no foundation
upon which to erect a nonlinear analysis. Therefore, in an introductory text on applied
mathematics, we are forced to develop in detail the proper linear foundations to aid us
when we confront the nonlinear beast.
Moreover, many important physical systems are weakly nonlinear, in the sense that,
while nonlinear effects do play an essential role, the linear terms dominate the system, and
so, to a first approximation, the system is close to linear. As a result, the underlying nonlinear phenomena can be understood by suitably perturbing their linear approximations.
Historically, while certain nonlinear problems date back to Newton (for example, the n-body problem arising in celestial mechanics and planetary motion), significant progress in
understanding weak nonlinearities only began after computers became sufficiently powerful tools. The truly nonlinear regime is, even today, only sporadically modeled and even
less well understood. Despite dramatic advances in both hardware and mathematical algorithms, many nonlinear systems, for instance Einsteinian gravitation, still remain beyond
the capabilities of today's computers and algorithms.
Space limitations imply that we can only provide a brief overview of some of the key
ideas and phenomena that arise when venturing into the nonlinear realm. This chapter is
devoted to the study of nonlinear functions and equations. In the remaining chapters, we

shall ascend the nonlinear dimensional ladder, passing from equilibrium to dynamics and
from discrete to continuous, mimicking our linear ascent that guided the logical progression
in the preceding chapters of the text.
We begin with an analysis of the iteration of nonlinear functions. Building on our
experience with iteration of linear systems, we will discover that functional iteration, when
it converges, provides a powerful mechanism for solving equations and optimization. When
it fails to converge, even very simple nonlinear iterations can lead to remarkably complex,
chaotic behavior. The second section is devoted to basic solution techniques for nonlinear
systems, and includes the bisection method, iterative methods, and the powerful Newton
method. The third section is devoted to optimization, i.e., the minimization of nonlinear
functions on finite-dimensional spaces. As we know, the equilibrium configurations of
discrete mechanical systems are minimizers of the potential energy in the system. The
points where the gradient of the function vanishes are the critical points, and include the local minima and maxima as well as non-optimizing saddle points. Nondegenerate critical points are classified by a second derivative test based on the Hessian matrix. These results from
multivariable calculus will be developed in a form that readily generalizes to minimization
problems on infinite-dimensional function space, to be presented in Chapter 20. Numerical
optimization procedures rely on iterative procedures, and we present those connected with
a gradient descent approach.

18.1. Iteration of Functions.


Iteration, or repeated application of a function, plays an essential role in the modern
theories of dynamical systems. Iteration can be regarded as a discrete dynamical system,
in which the continuous time variable has been quantized. Even iterating a very simple quadratic function leads to an amazing variety of phenomena, including convergence,
period doubling, and chaos. Discrete dynamical systems arise not just in mathematics,
but also underlie the theory of growth and decay of biological populations, predator-prey models, spread of communicable diseases such as AIDS, and a host of other natural phenomena. Moreover, many numerical solution methods for systems of algebraic equations, ordinary differential equations, partial differential equations and so on rely in essence on an iterative method, and so the basic results on function iteration play a key role in the analysis of convergence and efficiency of such numerical techniques.
In general, an iterative system of the form
u(k+1) = g(u(k) ),

(18.1)

is also known as a discrete dynamical system. A solution is a discrete collection of points


u(k) in which the index k = 0, 1, 2, 3, . . . takes on non-negative integer values. One might
also consider negative integral values k = −1, −2, . . . of the index, but we will not. The
superscripts on u(k) refer to the iteration number, and do not denote derivatives. The
index k may be viewed as the discrete time for the system, indicating the number of
days, years, seconds, etc.
The function g: ℝⁿ → ℝⁿ is usually assumed to be continuous. Later on we shall also

Complex iteration is based on a complex-valued function g: ℂⁿ → ℂⁿ.


require that g be reasonably smooth, meaning that it has at least one or two continuous
partial derivatives everywhere. Chapter 9 dealt with the case when g(u) = A u is a linear
function, necessarily given by multiplication by an n × n matrix A. In this chapter, we
allow nonlinear functions into the picture.
Once we specify an initial condition, say
u(0) = c,

(18.2)

for the initial iterate, then the solution is easy to compute mechanically:
u(1) = g(u(0) ) = g(c), u(2) = g(u(1) ) = g(g(c)), u(3) = g(u(2) ) = g(g(g(c))), . . .
and so on. Therefore, unlike continuous dynamical systems, existence and uniqueness
of solutions is immediate. As long as each successive iterate u(k) lies in the domain of
definition of g one merely repeats the process to produce the solution,
\[
u^{(k)} = \underbrace{g \circ g \circ \cdots \circ g}_{k}\,(c), \qquad k = 0, 1, 2, \ldots,
\tag{18.3}
\]

which is obtained by composing the function g with itself a total of k times. In other
words, the solution to a discrete dynamical system corresponds to repeatedly pushing the
g key on your calculator. For example, repeatedly hitting the sin key corresponds to a
solution to the system u(k+1) = sin u(k) . For simplicity, we shall tacitly assume that the
function g is defined on all of R n . Otherwise, we must always be careful that the successive
iterates u(k) never leave the domain of definition of g, which would cause the iteration to
break down.
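In code, computing the solution (18.3) really is just a loop; the following Python sketch (supplementary to the text) iterates the calculator example u(k+1) = sin u(k).

    import math

    def iterate(g, c, steps):
        """Return the orbit u^(0), u^(1), ..., u^(steps) of u^(k+1) = g(u^(k))."""
        orbit = [c]
        for _ in range(steps):
            orbit.append(g(orbit[-1]))
        return orbit

    print(iterate(math.sin, 1.0, 10))   # slowly decreasing iterates; the only fixed point of sin is 0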
While the solution to a discrete dynamical system is essentially trivial, understanding
its behavior is definitely not. Sometimes the solution converges to a particular value, the key requirement for numerical solution methods. Sometimes it goes off to ∞, or, more precisely, ‖u(k)‖ → ∞. Sometimes the solution repeats itself after a while. And sometimes it behaves in a random, chaotic manner, all depending on the function g and,
at times, the initial condition c. Although any of these cases may appear and play a role
in applications, we shall mostly concentrate upon understanding the case of convergence
of the iterates.
Definition 18.1. A fixed point or equilibrium solution for a discrete dynamical
system (18.1) is a vector u* ∈ ℝⁿ such that
\[
g(u^\star) = u^\star.
\tag{18.4}
\]

We easily see that every fixed point provides a constant solution, namely u(k) ≡ u*,
to the discrete dynamical system. Moreover, solutions that converge always converge to a
fixed point.
Proposition 18.2. If a solution to a discrete dynamical system converges,
\[
\lim_{k \to \infty} u^{(k)} = u^\star,
\]
then the limit u* is a fixed point of the system.


Proof: This is a simple consequence of the continuity of g. We have
\[
u^\star = \lim_{k \to \infty} u^{(k+1)} = \lim_{k \to \infty} g(u^{(k)}) = g\Bigl( \lim_{k \to \infty} u^{(k)} \Bigr) = g(u^\star),
\]
the last two equalities following from the continuity of g. Q.E.D.

Of course, not every solution to a discrete dynamical system will necessarily converge,
but Proposition 18.2 says that if it does, then it must converge to a fixed point. Thus, the
goal is to understand when a solution converges, and, if so, to which fixed point if there
is more than one. (In the linear case, only the actual convergence is a significant issues
since most linear systems admit exactly one fixed point, namely u? = 0.) Fixed points are
roughly divided into three classes: asymptotically stable, with the property that all nearby
solutions converge to it, stable, with the property that all nearby solutions stay nearby, and
unstable, almost all of whose nearby solutions diverge away from the fixed point. Thus,
from a practical standpoint, convergence of the iterates of a discrete dynamical system
requires asymptotic stability of the fixed point.
Scalar Functions
As always, the first step is to thoroughly understand the scalar case, and so we begin
with a discrete dynamical system
u(k+1) = g(u(k) ),

u(0) = c,

(18.5)

in which g: ℝ → ℝ is a continuous, scalar-valued function. As noted above, we will assume,


for simplicity, that g is defined everywhere, and so the iterates u(0) , u(1) , u(2) , . . . are all
well-defined.
The linear case g(u) = a u was treated in Section 9.1, following (9.2). The simplest
nonlinear case is that of an affine function
g(u) = a u + b,

(18.6)

leading to an affine discrete dynamical system


u(k+1) = a u(k) + b.

(18.7)

The only fixed point is the solution to
\[
u^\star = g(u^\star) = a\,u^\star + b, \qquad \text{namely} \qquad u^\star = \frac{b}{1 - a}.
\tag{18.8}
\]

The formula for u* requires that a ≠ 1, and, indeed, the case a = 1 has no fixed point, as the reader can easily confirm; see Exercise . Since we already know the value of u*, we
can easily analyze the difference
\[
e^{(k)} = u^{(k)} - u^\star,
\tag{18.9}
\]

between the iterate u(k) and the fixed point. The smaller e(k) is, the closer u(k) is to the
desired fixed point. In many applications, the iterate u(k) is viewed as an approximation

to the fixed point u*, and so e(k) is interpreted as the error in the k-th iterate. Subtracting the fixed point equation (18.8) from the iteration equation (18.7), we find
\[
u^{(k+1)} - u^\star = a\,(u^{(k)} - u^\star).
\]
Therefore, the errors e(k) satisfy a linear iteration
\[
e^{(k+1)} = a\,e^{(k)}, \qquad \text{and hence} \qquad e^{(k)} = a^k\,e^{(0)}.
\tag{18.10}
\]

Therefore, as we already demonstrated in Section 9.1, the solutions to this scalar linear iteration converge,
\[
e^{(k)} \to 0 \qquad \text{and hence} \qquad u^{(k)} \to u^\star, \qquad \text{if and only if} \qquad |\,a\,| < 1.
\]

This is the criterion for asymptotic stability of the fixed point, or, equivalently, convergence
of the affine iterative system (18.7). The magnitude of | a | < 1 determines the rate of
convergence, and the closer it is to 0, the faster the iterates approach the fixed point.
Example 18.3. Suppose g(u) = ¼ u + 2, and so we consider the iterative scheme
\[
u^{(k+1)} = \tfrac{1}{4}\,u^{(k)} + 2.
\]
Starting with the initial condition u(0) = 0, the ensuing values are
k       1      2      3       4        5        6        7        8
u(k)   2.0    2.5    2.625   2.6562   2.6641   2.6660   2.6665   2.6666
Thus, after 8 iterations, the iterates have converged to the fixed point u* = 8/3 to 4 decimal places. The rate of convergence is 1/4, and indeed
\[
|\,u^{(k)} - u^\star\,| = \bigl(\tfrac{1}{4}\bigr)^k\,|\,u^{(0)} - u^\star\,| = \tfrac{8}{3}\,\bigl(\tfrac{1}{4}\bigr)^k \;\longrightarrow\; 0 \qquad \text{as} \qquad k \to \infty.
\]
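As a quick check, the iteration of Example 18.3 can be reproduced in a few lines of Python (a supplementary sketch):

    def affine_orbit(a, b, c, steps):
        """Iterate u^(k+1) = a u^(k) + b starting from u^(0) = c."""
        u, orbit = c, []
        for _ in range(steps):
            u = a * u + b
            orbit.append(u)
        return orbit

    print(affine_orbit(0.25, 2.0, 0.0, 8))   # 2.0, 2.5, 2.625, ... -> fixed point b/(1-a) = 8/3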

Let us now turn to the fully nonlinear case. In general, near a given point, any
(smooth) nonlinear function can be approximated by its tangent line, which is an affine
function; see Figure tline . Therefore, if we are close to a fixed point u*, then we might expect that the nonlinear system will behave very much like the iteration of its affine approximation. And, indeed, this intuition turns out to be essentially correct. This
result forms our first concrete example of linearization, in which the analysis of a nonlinear
system is based on its linear (or, more correctly, affine) approximation.
The explicit formula for the tangent line to g(u) near the fixed point u = u* is
\[
g(u) \approx g(u^\star) + g'(u^\star)\,(u - u^\star) = a\,u + b,
\tag{18.11}
\]
where
\[
a = g'(u^\star), \qquad b = g(u^\star) - g'(u^\star)\,u^\star = \bigl(1 - g'(u^\star)\bigr)\,u^\star.
\]

Note that u* = b/(1 − a) remains a fixed point for the affine approximation. According to the preceding discussion, the convergence of the iterates for the affine approximation is governed by the size of the coefficient a = g′(u*). This observation inspires the key
stability criterion for fixed points of scalar iterative systems.

Theorem 18.4. Suppose g(u) is a continuously differentiable scalar function. Suppose u* = g(u*) is a fixed point. If | g′(u*) | < 1, then u* is a stable fixed point, and hence any sequence of iterates u(k) which starts out sufficiently close to u* will converge to u*. On the other hand, if | g′(u*) | > 1, then u* is an unstable fixed point, and the only iterates which converge to it are those that land exactly on it, i.e., u(k) = u* for some k ≥ 0.
Proof: The goal is to prove that the errors e(k) = u(k) − u* between the k-th iterate and the true fixed point tend to 0 as k → ∞. To this end, we try to estimate e(k+1) in terms of e(k). According to (18.5) and the Mean Value Theorem C.3 from calculus,
\[
e^{(k+1)} = u^{(k+1)} - u^\star = g(u^{(k)}) - g(u^\star) = g'(v)\,(u^{(k)} - u^\star) = g'(v)\,e^{(k)},
\tag{18.12}
\]
for some v lying between u(k) and u*. By continuity, if | g′(u*) | < 1 at the fixed point, then we can choose 0 < σ < 1 such that
\[
|\,g'(v)\,| \le \sigma < 1 \qquad \text{whenever} \qquad |\,v - u^\star\,| < \delta
\tag{18.13}
\]
holds in a (perhaps small) interval surrounding the fixed point. If | e(k) | = | u(k) − u* | < δ, then the point v in (18.12) satisfies (18.13). Therefore,
\[
|\,u^{(k+1)} - u^\star\,| \le \sigma\,|\,u^{(k)} - u^\star\,|, \qquad \text{and hence} \qquad |\,e^{(k+1)}\,| \le \sigma\,|\,e^{(k)}\,|.
\tag{18.14}
\]

In particular, since σ < 1, if | u(k) − u* | < δ, then | u(k+1) − u* | < δ, and hence the subsequent iterate u(k+1) also lies in the interval where (18.13) holds. Iterating, we conclude that the errors satisfy
\[
|\,e^{(k)}\,| \le \sigma^k\,|\,e^{(0)}\,|, \qquad \text{and hence} \qquad
|\,e^{(k)}\,| = |\,u^{(k)} - u^\star\,| \;\longrightarrow\; 0 \qquad \text{as} \qquad k \to \infty,
\tag{18.15}
\]
which completes the proof of the theorem in the stable case. The proof in the unstable case is left as Exercise for the reader. Q.E.D.
Remark: The borderline cases g′(u*) = ±1 are not covered by the theorem. For a linear system, these cases are stable, but not asymptotically stable. For nonlinear systems, such borderline situations require more detailed knowledge of the nonlinear terms in order to resolve the status, stable or unstable, of the fixed point. Despite their importance in certain applications, we will not try to analyze such borderline cases any further here. From now on, we will only deal with asymptotically stable fixed points, and, for brevity, usually omit the adjective asymptotically.
Example 18.5. Given constants ε, m, the trigonometric equation
\[
u = m + \varepsilon \sin u
\tag{18.16}
\]
is known as Kepler's equation. It arises in the study of planetary motion, with | ε | < 1 representing the eccentricity of an elliptical planetary orbit and m its mean anomaly; see Figure Kepler . The desired solution u is the eccentric anomaly, and governs the motion of the planet around the ellipse. Details can be found in [63; p. 119].
The solutions to Kepler's equation are the fixed points of the discrete dynamical system based on the function g(u) = m + ε sin u. Note that
\[
|\,g'(u)\,| = |\,\varepsilon \cos u\,| \le |\,\varepsilon\,| < 1,
\tag{18.17}
\]

which automatically implies that the as yet unknown fixed point is stable. Indeed, Exercise
implies that condition (18.17) is enough to prove the existence of a unique stable fixed
point. In the particular case m = ε = ½, the result of iterating u(k+1) = ½ + ½ sin u(k)
starting with u(0) = 0 is
k       1      2        3        4        5        6        7        8        9
u(k)   0.5    0.7397   0.8370   0.8713   0.8826   0.8862   0.8873   0.8877   0.8878

After 13 iterations, we have converged sufficiently close to the solution (fixed point) u* =
0.887862 to have computed its value to 7 decimal places.
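For readers who want to reproduce these numbers, here is a minimal Python sketch of the fixed-point iteration for Kepler's equation with m = ε = 1/2 (supplementary to the text):

    import math

    def kepler_fixed_point(m, eps, u0=0.0, tol=1e-10, max_iter=100):
        """Solve u = m + eps*sin(u) by iterating g(u) = m + eps*sin(u)."""
        u = u0
        for k in range(1, max_iter + 1):
            u_next = m + eps * math.sin(u)
            if abs(u_next - u) < tol:
                return u_next, k
            u = u_next
        return u, max_iter

    u_star, iterations = kepler_fixed_point(0.5, 0.5)
    print(u_star, iterations)    # approximately 0.887862 after a dozen or so iterations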
Remark : Inspection of the proof of Theorem 18.4 reveals that we never really used
the differentiability of g, except to verify the inequality
\[
|\,g(u) - g(v)\,| \le \sigma\,|\,u - v\,| \qquad \text{for some fixed } \sigma.
\tag{18.18}
\]

A function that satisfies (18.18) for all u nearby a given point v is called Lipschitz continuous, in honor of the 19th century German mathematician Rudolf Lipschitz. The Mean Value Theorem C.3 implies that any continuously differentiable function g ∈ C¹ is automatically Lipschitz continuous, but there are nondifferentiable examples. The simplest is the absolute value function g(u) = | u |, which is Lipschitz continuous, since
\[
|\,g(u) - g(v)\,| = \bigl|\,|u| - |v|\,\bigr| \le |\,u - v\,| \qquad \text{for any } u, v \in \mathbb{R},
\]

but is not differentiable at u = 0. On the other hand, as its name indicates, Lipschitz
continuity does imply continuity. Thus, stability of the fixed point follows from the weaker
hypothesis that g(u) is Lipschitz continuous at u* with Lipschitz constant σ < 1.

Example 18.6. The simplest truly nonlinear example is a quadratic polynomial.


The most important case is the so-called logistic map
\[
g(u) = \lambda\,u\,(1 - u),
\tag{18.19}
\]
where λ ≠ 0 is a fixed non-zero parameter. (The case λ = 0 is completely trivial. Why?) In fact, an elementary change of variables can make any quadratic iterative system into one involving a logistic map; see Exercise .
The fixed points of the logistic map are the solutions to the quadratic equation
\[
u = \lambda\,u\,(1 - u), \qquad \text{or} \qquad \lambda\,u^2 + (1 - \lambda)\,u = 0.
\]
Using the quadratic formula, we conclude that g(u) has two fixed points:
\[
u_1^\star = 0, \qquad u_2^\star = 1 - \frac{1}{\lambda}.
\]

Let us apply Theorem 18.4 to determine their stability. The derivative is
\[
g'(u) = \lambda - 2\,\lambda\,u, \qquad \text{and so} \qquad g'(u_1^\star) = \lambda, \qquad g'(u_2^\star) = 2 - \lambda.
\]

Therefore, if | λ | < 1, the first fixed point is stable, while if 1 < λ < 3, the second fixed point is stable. For λ < −1 or λ > 3 neither fixed point is stable, and we expect the iterates to not converge at all.
Numerical experiments with this example show that it is the source of an amazingly
diverse range of behavior, depending upon the value of the parameter λ. In the following table , we display the results of iteration starting with initial point u(0) = 1. As expected from Theorem 18.4, the iterates converge to one of the fixed points in the range 1 < λ < 3, except when λ = 1. For λ a little bit larger than λ₁ = 3, the iterates do not converge to a
fixed point; an example appears in the table . But it does not take long for them to settle
down and switch back and forth between two particular values. This behavior indicates
that there is a (stable) period 2 orbit for the discrete dynamical system, in accordance
with the following definition.
Definition 18.7. A period k orbit of a discrete dynamical system is a solution that
satisfies u(n+k) = u(n) for all n = 0, 1, 2, . . . . The (minimal ) period is the smallest positive
value of k for which this condition holds.
Thus, a fixed point
u(0) = u(1) = u(2) = ···
is a period 1 orbit. A period 2 orbit satisfies
u(0) = u(2) = u(4) = ···   and   u(1) = u(3) = u(5) = ···,
but u(0) ≠ u(1), as otherwise the minimal period would be 1. Similarly, a period 3 orbit has
u(0) = u(3) = u(6) = ···,   u(1) = u(4) = u(7) = ···,   u(2) = u(5) = u(8) = ···,

with u(0) , u(1) , u(2) distinct. Stability implies that nearby iterates converge to this periodic
solution.
For the logistic map, the period 2 orbit persists until λ = λ₂ ≈ 3.4495, after which the iterates alternate between four values, a period 4 orbit. This again changes at λ = λ₃ ≈ 3.5441, after which the iterates end up alternating between eight values. In fact, there is an increasing sequence of values
\[
3 = \lambda_1 < \lambda_2 < \lambda_3 < \lambda_4 < \cdots,
\]
where, for any λₙ < λ < λₙ₊₁, the iterates eventually follow a period 2ⁿ orbit. Thus, as λ passes through each value λₙ the period of the orbit doubles from 2ⁿ to 2·2ⁿ = 2ⁿ⁺¹, and the discrete dynamical system experiences a bifurcation. The bifurcation values λₙ lie closer and closer together, piling up on an eventual limit λ⋆ = lim_{n→∞} λₙ ≈ 3.5699, at which point the period has become infinitely large. The entire phenomenon is known as a period doubling cascade. Interestingly, the ratios of the distances between successive bifurcation points approach a well-defined limit,
\[
\frac{\lambda_{n+1} - \lambda_n}{\lambda_{n+2} - \lambda_{n+1}} \;\longrightarrow\; 4.6692\ldots,
\tag{18.20}
\]

known as Feigenbaum's constant. In the 1970s, the American physicist Mitchell Feigenbaum, [43], discovered that this period doubling cascade appears in a broad range of discrete dynamical systems. Even more remarkably, in all cases, the corresponding ratios of distances between bifurcation points have the same limiting value. This was subsequently
proved by Oscar Lanford in 1982, [83].
After λ passes the limiting value λ⋆, all hell breaks loose. The iterates become completely chaotic, moving at random over the interval [ 0, 1 ]. But this is not the end of the story. Embedded within this chaotic regime are certain small ranges of λ where the system settles down to a stable orbit, whose period is not necessarily a power of 2. In fact, there exist values of λ for which the iterates settle down to a stable orbit of period m for any positive integer m. For instance, as λ increases past λ₃⋆ ≈ 3.83, a period 3 orbit appears for a while; then it experiences a succession of period doubling cascades of period 6, 12, 24, . . . orbits, each persisting on a shorter and shorter interval of parameter values, until chaos breaks out yet again. There is a well-prescribed order in which the periodic cases appear, and each period m is followed by a very closely spaced sequence of period doubling bifurcations, of periods 2ⁿ m for n = 1, 2, 3, . . . , after which the iterates revert to completely chaotic behavior until the next periodic case emerges. The ratios of distances between bifurcation points have the same Feigenbaum limit (18.20). Finally, these periodic and chaotic windows all pile up on the ultimate parameter value λ⋆⋆ = 4. And then, when λ > 4, all the iterates go off to −∞, and the system ceases to be interesting.
The reader is encouraged to write a simple computer program and perform some
numerical experiments. In particular, Figure log shows the asymptotic behavior of the iterates for values of the parameter in the interesting range 2 < λ < 4. The horizontal axis is λ, and the marked points show the ultimate fate of the iteration for the given value of λ. For instance, the single curve lying above low values of λ represents a fixed point; this bifurcates into a pair of curves representing a stable period 2 orbit, which then bifurcates into 4 curves representing a period 4 orbit, and so on. Chaotic behavior is indicated by a somewhat random pattern of points lying above the value of λ. To plot this figure, we ran the iteration u(n) for 0 ≤ n ≤ 100, and then discarded the first 50 points, plotting the next 50 iterates u(51), . . . , u(100). Investigation of the fine detailed structure of the logistic map requires yet more iterations with increased accuracy. In addition, one should discard more of the initial iterates so as to give the system enough time to settle down to a stable periodic orbit or continue in a chaotic manner.
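Following the suggestion above, here is one possible Python program (a supplementary sketch, not taken from the text) that generates the orbit diagram just described: for each λ it iterates the logistic map, discards a transient, and plots the remaining iterates.

    import numpy as np
    import matplotlib.pyplot as plt

    lambdas = np.linspace(2.0, 4.0, 1200)   # parameter range described in the text
    u = 0.5 * np.ones_like(lambdas)         # one orbit per parameter value
    transient, keep = 200, 100              # discard the transient, record the asymptotic behavior

    for _ in range(transient):
        u = lambdas * u * (1 - u)

    lam_pts, u_pts = [], []
    for _ in range(keep):
        u = lambdas * u * (1 - u)
        lam_pts.append(lambdas)
        u_pts.append(u.copy())

    plt.plot(np.concatenate(lam_pts), np.concatenate(u_pts), ",k")
    plt.xlabel("lambda"); plt.ylabel("u")
    plt.title("Asymptotic behavior of the logistic map")
    plt.show()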
Remark : So far, we have only looked at real scalar iterative systems. Complex discrete
dynamical systems display yet more remarkable and fascinating behavior. The complex
version of the logistic iteration equation leads to the justly famous Mandelbrot set, [84],
with its stunning, psychedelic fractal structure, [100].
The rich range of phenomena in evidence even in such extremely simple nonlinear
iterative systems is astounding. While intimations of this first appeared in the late nineteenth-century research of the influential French mathematician Henri Poincaré, serious

The term chaotic does have a precise mathematical definition, but the reader can take it
more figuratively for the purposes of this elementary introduction.


investigations were delayed until the advent of the computer era, which precipitated an
explosion of research activity in the area of dynamical systems. Similar period doubling
cascades and chaos can be found in a broad range of nonlinear systems, [7], and are often
encountered in physical applications, [89]. A modern explanation of fluid turbulence is
that it is a (very complicated) form of chaos.
Quadratic Convergence
Let us now return to the more mundane case when the iterates converge to a stable fixed
point of the discrete dynamical system. In applications, we are interested in computing
a precise numerical value for the fixed point, and hence the speed of convergence of the
iterates is of crucial importance.
According to Theorem 18.4, the convergence rate of an iterative system is essentially
governed by the magnitude of the derivative | g′(u*) | at the fixed point. The basic error inequality (18.14), namely
\[
|\,e^{(k+1)}\,| \le \sigma\,|\,e^{(k)}\,|,
\]
is known as a linear convergence estimate. It means that the error decreases by a factor of at least σ at each step. If the k-th iterate u(k) approximates the fixed point u* correctly to m decimal places, so | e(k) | < .5 × 10^{−m}, then the (k + 1)st iterate satisfies
\[
|\,e^{(k+1)}\,| < .5\,\sigma \times 10^{-m} = .5 \times 10^{-m + \log_{10}\sigma}.
\]
More generally, for any j > 0,
\[
|\,e^{(k+j)}\,| < .5\,\sigma^j \times 10^{-m} = .5 \times 10^{-m + j \log_{10}\sigma},
\]
which means that the (k + j)th iterate u(k+j) has at least
\[
m - j \log_{10}\sigma = m + j \log_{10}\sigma^{-1}
\]
correct decimal places. For instance, if σ = .1 then each new iterate produces one new decimal place of accuracy (at least), while if σ = .9 then it typically takes 22 ≈ −1/ log₁₀ .9 iterates to produce just one additional accurate digit!
As a consequence, there is a huge advantage, particularly in the application of iterative methods to the numerical solution of equations, to arranging that | g′(u*) | be as small as possible. The fastest convergence rate of all will occur when g′(u*) = 0. Now the constant σ in (18.14) can be taken to be arbitrarily small, although the smaller σ is, the smaller the interval | v − u* | < δ on which (18.14) applies, and so the closer one must be to the fixed point. Be that as it may, once the iterates start converging, they will get closer
and closer to the fixed point, and so the rate of convergence will speed up accordingly. In
fact, for such functions, the rate of convergence is not just slightly, but dramatically faster
than linear.

The degree of precision is to be specified by the user and the application.

Note that since σ < 1, the logarithm log₁₀ σ⁻¹ = − log₁₀ σ > 0 is positive.


Theorem 18.8. Let g(u) ∈ C². Suppose u* = g(u*) is a fixed point such that g′(u*) = 0. Then, for all iterates u(k) sufficiently close to u*, the errors e(k) = | u(k) − u* | satisfy the quadratic convergence estimate
\[
|\,e^{(k+1)}\,| \le \tau\,|\,e^{(k)}\,|^2
\tag{18.21}
\]
for some constant τ > 0.

Proof : Just as in the proof of the linear convergence estimate (18.14), the proof relies
on approximating the function by a simpler function near the fixed point. For linear
convergence, an affine approximation sufficed, but in this case we require a higher order,
quadratic approximation. Instead of the mean value formula (18.12), we now use the first
order Taylor expansion (C.6) of g near u*:
\[
g(u) = g(u^\star) + g'(u^\star)\,(u - u^\star) + \tfrac{1}{2}\,g''(w)\,(u - u^\star)^2,
\tag{18.22}
\]
where the error term depends on an (unknown) point w that lies between u and u*. At a fixed point, the constant term is g(u*) = u*. Furthermore, under our hypothesis g′(u*) = 0, and so the Taylor expansion (18.22) reduces to
\[
g(u) - u^\star = \tfrac{1}{2}\,g''(w)\,(u - u^\star)^2.
\]
Therefore,
\[
|\,g(u) - u^\star\,| \le \tau\,|\,u - u^\star\,|^2,
\tag{18.23}
\]
where τ is chosen so that
\[
\tau \ge \tfrac{1}{2}\,|\,g''(w)\,|
\tag{18.24}
\]

for all w sufficiently close to u*. Therefore, the magnitude of τ is governed by the size of
the second derivative of the iterative function g(u) near the fixed point. We apply (18.23)
to estimate the error
\[
|\,e^{(k+1)}\,| = |\,u^{(k+1)} - u^\star\,| = |\,g(u^{(k)}) - g(u^\star)\,| \le \tau\,|\,u^{(k)} - u^\star\,|^2 = \tau\,|\,e^{(k)}\,|^2,
\]

which establishes the quadratic convergence estimate (18.21).

Q.E.D.

Let us see how the quadratic estimate (18.21) speeds up the convergence rate. Following our earlier argument, suppose u(k) is correct to m decimal places, so
\[
|\,e^{(k)}\,| < .5 \times 10^{-m}.
\]
Then (18.21) implies that
\[
|\,e^{(k+1)}\,| < .5\,\tau\,(10^{-m})^2 = .5 \times 10^{-2m + \log_{10}\tau},
\]
and so u(k+1) has 2m − log₁₀ τ accurate decimal places. If g″(u*) is of moderate


size, we essentially double the number of accurate decimal places in just a single iterate! A
second iteration will double the number of accurate digits yet again. Thus, the convergence
of a quadratic iteration scheme is extremely rapid, and, barring round-off errors, one can
produce any desired number of digits of accuracy in a very short time. For example, if we
start with an initial guess that is accurate in the first decimal digit, then a linear iteration
with σ = .1 will require 49 iterations to obtain 50 decimal place accuracy, whereas a quadratic iteration (with τ = 1) will only require 6 iterations to obtain 2⁶ = 64 decimal
places of accuracy!

Example 18.9. Consider the function
\[
g(u) = \frac{2\,u^3 + 1}{3\,u^2 + 1}.
\]
There is a unique fixed point u* = g(u*), which is the solution to the cubic equation
\[
u^3 + u - 1 = 0.
\]
Note that
\[
g'(u) = \frac{6\,u^4 + 6\,u^2 - 6\,u}{(3\,u^2 + 1)^2} = \frac{6\,u\,(u^3 + u - 1)}{(3\,u^2 + 1)^2},
\]
and hence g′(u*) vanishes at the fixed point. Theorem 18.8 implies that the iterations
should exhibit quadratic convergence to the root. Indeed, we find, starting with u(0) = 0, the following values. Note the dramatically faster convergence, especially when contrasted with a merely linearly convergent scheme.
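The iteration is easy to carry out; the following Python sketch (supplementary to the text; the printed digits are produced by the code, not copied from the original table) starts from u(0) = 0 and displays the characteristic doubling of correct digits.

    def g(u):
        """Quadratically convergent iteration function for the cubic u**3 + u - 1 = 0."""
        return (2 * u**3 + 1) / (3 * u**2 + 1)

    u = 0.0
    for k in range(1, 7):
        u = g(u)
        print(k, f"{u:.15f}", f"residual = {u**3 + u - 1:.1e}")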
For a general discrete dynamical system, the appearance of a quadratically convergent
fixed point is a matter of luck. The construction of general purpose quadratically convergent iterative methods for solving equations will be the focus of the following Section 18.2.
Vector-Valued Iteration
Extending the preceding analysis to vector-valued iterative systems is not especially
difficult. We will build on our experience with linear iterative systems, and so the reader
should review the basic concepts and results from Chapter 9 before proceeding to the
nonlinear cases presented here.
We begin by fixing a norm ‖·‖ on ℝⁿ. Since we will also be computing the associated matrix norm ‖A‖, as defined in Theorem 9.16, it may be computationally more convenient to adopt either the 1 or the ∞ norms rather than the standard Euclidean norm. As far as
the theory goes, however, the precise choice of norm is unimportant.
We begin by defining the vector-valued counterpart of the basic linear convergence
condition (18.18).
Definition 18.10. A function g: ℝⁿ → ℝⁿ is Lipschitz continuous at a point a ∈ ℝⁿ if there exists a constant σ ≥ 0, known as the Lipschitz constant, such that
\[
\|\,g(u) - g(a)\,\| \le \sigma\,\|\,u - a\,\|
\tag{18.25}
\]
for all u sufficiently close to a, i.e., ‖u − a‖ < δ for some fixed δ > 0.

Example 18.11. Consider the function g(u) = ( |u − v|, max{ |u|, |v| } )ᵀ, defined for u = (u, v)ᵀ ∈ ℝ². Although g is not differentiable, it does satisfy the Lipschitz estimate (18.25) for the 1 norm ‖u‖₁ = |u| + |v|. Indeed, writing a = (a, b)ᵀ,
\[
\|\,g(u) - g(a)\,\|_1 = \bigl|\,|u - v| - |a - b|\,\bigr| + \bigl|\,\max\{|u|,|v|\} - \max\{|a|,|b|\}\,\bigr|
\le 2\,\bigl(\,|u - a| + |v - b|\,\bigr) = 2\,\|\,u - a\,\|_1.
\]
Thus, (18.25) holds with uniform Lipschitz constant σ = 2.



Remark: The notion of Lipschitz continuity appears to depend on the underlying choice of norm. However, the fact that all norms on a finite-dimensional vector space are essentially equivalent (see Theorem 3.19) implies that this concept is, in fact, independent of the choice of norm. However, one should keep in mind that the value of the Lipschitz constant is norm-dependent.
The Lipschitz inequality (18.25) provides an immediate proof of the basic convergence
theorem for iteration of a discrete dynamical system (18.1). Recall that a fixed point is
called asymptotically stable if u(k) → u* for every initial condition u(0) = c sufficiently close to u*.
Theorem 18.12. If u* = g(u*) is a fixed point for the discrete dynamical system (18.1) and g is Lipschitz continuous at u* with Lipschitz constant σ < 1, then u* is an asymptotically stable fixed point.
Proof : The proof is a copy of the last part of the proof of Theorem 18.4. We write
\[
\|\,u^{(k+1)} - u^\star\,\| = \|\,g(u^{(k)}) - g(u^\star)\,\| \le \sigma\,\|\,u^{(k)} - u^\star\,\|,
\]
using the assumed Lipschitz estimate (18.25). Iterating this basic inequality immediately demonstrates that
\[
\|\,u^{(k)} - u^\star\,\| \le \sigma^k\,\|\,u^{(0)} - u^\star\,\| \qquad \text{for} \qquad k = 0, 1, 2, 3, \ldots.
\]
Since σ < 1, the right hand side tends to 0 as k → ∞, and hence u(k) → u*. Q.E.D.
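To see Theorem 18.12 in action, here is a small Python sketch (an illustrative map of our own choosing, not from the text) that iterates a two-dimensional function whose partial derivatives are all at most ¼ in magnitude, so its Lipschitz constant in the ∞ norm is at most ½ < 1, and the iterates home in on the unique fixed point even from a distant starting point.

    import numpy as np

    def g(u):
        """A contractive map on R^2: every partial derivative is bounded by 1/4."""
        x, y = u
        return np.array([0.25 * np.sin(x + y) + 0.3,
                         0.25 * np.cos(x - y) - 0.1])

    u = np.array([10.0, -7.0])          # a deliberately distant starting point
    for k in range(1, 50):
        u_next = g(u)
        if np.max(np.abs(u_next - u)) < 1e-12:
            u = u_next
            break
        u = u_next

    print(k, u)                          # fixed point u* with g(u*) = u* to machine precision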

For more complicated functions, the direct verification of the Lipschitz inequality
(18.25) is not particularly easy. However, as in the scalar case, any continuously differentiable function is automatically Lipschitz continuous.
Theorem 18.13. If g(u) ∈ C¹ has continuous first order partial derivatives for all u sufficiently close to u*, then g is Lipschitz continuous at u*.
Proof: According to (C.10), the first order Taylor expansion of a vector-valued function at a point u* takes the form
\[
g(u) = g(u^\star) + g'(u^\star)\,(u - u^\star) + R(u - u^\star).
\tag{18.26}
\]
Here

\[
g'(u) = \begin{pmatrix}
\dfrac{\partial g_1}{\partial u_1} & \dfrac{\partial g_1}{\partial u_2} & \cdots & \dfrac{\partial g_1}{\partial u_n} \\[8pt]
\dfrac{\partial g_2}{\partial u_1} & \dfrac{\partial g_2}{\partial u_2} & \cdots & \dfrac{\partial g_2}{\partial u_n} \\[4pt]
\vdots & \vdots & \ddots & \vdots \\[4pt]
\dfrac{\partial g_n}{\partial u_1} & \dfrac{\partial g_n}{\partial u_2} & \cdots & \dfrac{\partial g_n}{\partial u_n}
\end{pmatrix}
\tag{18.27}
\]


is the n × n Jacobian matrix of the vector-valued function g whose entries are the partial
derivatives of its individual components. The remainder term in (18.26) satisfies
\[
\|\,R(v)\,\| \le \gamma\,\|\,v\,\|^2 \qquad \text{whenever} \qquad \|\,v\,\| \le \delta,
\]
for some positive constant γ > 0. If the corresponding matrix norm of the Jacobian matrix at u* satisfies
\[
\|\,g'(u^\star)\,\| = \sigma^\star,
\]
then, by the triangle inequality and the definition (9.23) of matrix norm,
\[
\|\,g(u) - g(u^\star)\,\| = \|\,g'(u^\star)\,(u - u^\star) + R(u - u^\star)\,\|
\le \|\,g'(u^\star)\,(u - u^\star)\,\| + \|\,R(u - u^\star)\,\|
\le \|\,g'(u^\star)\,\|\,\|\,u - u^\star\,\| + \gamma\,\|\,u - u^\star\,\|^2
\le (\sigma^\star + \gamma\,\delta)\,\|\,u - u^\star\,\|,
\tag{18.28}
\]
whenever ‖u − u*‖ ≤ δ. This proves that g is Lipschitz continuous at u* with Lipschitz constant σ = σ⋆ + γδ. Note that, by choosing δ small enough, we can ensure that the Lipschitz constant σ is arbitrarily close to the matrix norm σ⋆. Q.E.D.
For a continuously differentiable function, then, asymptotic stability is a consequence
of the size, or, more correctly, the spectral radius of the Jacobian matrix at the fixed point.
Theorem 18.14. Suppose g(u) ∈ C². If u* = g(u*) is a fixed point such that g′(u*) is a convergent matrix, then u* is asymptotically stable. The rate of convergence of the iterative scheme u(k+1) = g(u(k)) to u* is governed by the spectral radius of g′(u*).
Proof: If g′(u*) is convergent, and hence has spectral radius strictly less than 1, then Corollary 9.28 assures us that there exists a matrix norm such that
\[
\|\,g'(u^\star)\,\| = \sigma^\star < 1.
\tag{18.29}
\]
Defining γ as in the proof of Theorem 18.13, we then choose δ > 0 so that
\[
\sigma = \sigma^\star + \gamma\,\delta < 1.
\]
Then (18.28) implies that
\[
\|\,g(u) - g(u^\star)\,\| \le \sigma\,\|\,u - u^\star\,\|, \qquad \text{provided} \qquad \|\,u - u^\star\,\| < \delta.
\]
As before, this suffices to prove convergence of the iterates to u*. Q.E.D.

Example 18.15.
Theorem 18.14 tells us that initial values u(0) that are sufficiently near a stable fixed
point u? are guaranteed to converge to it. In the linear case, closeness of the initial data
to the fixed point was not, in fact, an issue; all stable fixed points are, in fact, globally
stable. For nonlinear iteration, it is of critical importance, and one does not typically
expect iteration starting with far away initial data to converge to the desired fixed point.

We can use any convenient norm on R n .


An interesting (and difficult) problem is to determine the so-called basin of attraction of


a stable fixed point, defined as the set of all initial data that ends up converging to it. As
in the elementary logistic map (18.19), initial values that lie outside a basin of attraction
can lead to divergent iterates, periodic orbits, or even exhibit chaotic behavior. The full
range of possible phenomena is a subject of contemporary research in dynamical systems
theory and in numerical analysis, [7].
The smaller the spectral radius or matrix norm of the Jacobian matrix at the fixed
point, the faster the iterates converge to it. As in the scalar case, quadratic convergence will
occur when the Jacobian matrix g′(u*) = O is the zero matrix, i.e., all first order partial derivatives of the components of g vanish at the fixed point. The quadratic convergence estimate
\[
\|\,u^{(k+1)} - u^\star\,\| \le \tau\,\|\,u^{(k)} - u^\star\,\|^2
\tag{18.30}
\]
is a consequence of the second order Taylor expansion at the fixed point. Details of the
proof are left as an exercise.
Example 18.16.
In general, the existence of a fixed point of an iterative system is not automatic. One
way is to observe the iterates starting with suitably selected initial data; if they converge,
then Proposition 18.2 assures us that their limit is a fixed point. There is one important
class of maps for which we have a theoretical justification, not only of the existence, but
also the uniqueness of a fixed point.
Definition 18.17. A map g: Ω → Ω is called a contraction mapping if it has Lipschitz constant σ < 1 at all points in Ω.
Therefore, applying a contraction mapping reduces the distance between points. As a result, a contraction mapping shrinks the size of its domain; see Figure contract . Consequently, as the iterations proceed, the domain gets smaller and smaller and the iterates become trapped. If the original domain is closed and bounded, then it is forced to shrink
down to a single point, which is the unique fixed point of the iterative system.
The simplest example of a contraction mapping is the scaling map g(u) = α u with 0 < α < 1. Starting with the unit ball B₁ = { ‖u‖ ≤ 1 }, at the k-th iteration the points have been mapped into a contracted sphere of radius αᵏ. As k → ∞ these contracted domains become smaller and smaller, converging in the limit to the unique fixed point u* = 0. A precise statement of the Contraction Mapping Theorem follows; see [map] for the proof.
Theorem 18.18. If g: Ω → Ω is a contraction mapping defined on a closed bounded domain Ω ⊂ ℝⁿ, then g admits a unique fixed point u*. Moreover, starting with any initial point u(0) ∈ Ω, the iterates necessarily converge to the fixed point: u(k) → u*.
More sophisticated, powerful fixed point theorems require advanced knowledge of
algebraic topology and will not be developed in this text. See [fixed] for details.

Having zero spectral radius is not sufficient for quadratic convergence; see Exercise .


18.2. Solution of Equations and Systems.


The solution of nonlinear equations and systems of equations is, of course, a problem of
utmost importance in mathematics and its manifold applications. In the general situation,
we are given a collection of m functions depending upon n variables, and we are interested in finding all solutions u = (u₁, u₂, . . . , uₙ)ᵀ to the system
\[
f_1(u_1, \ldots, u_n) = 0, \qquad \ldots \qquad f_m(u_1, \ldots, u_n) = 0.
\tag{18.31}
\]

In practice, as in the linear case, we are primarily interested in the case when the number
of equations is equal to the number of unknowns, m = n, as one can only expect both
existence and uniqueness of solutions in such situations. This point will be discussed in
further detail below.
There is no universal direct solution method for nonlinear equations and systems
comparable to Gaussian elimination. As a result, numerical solution techniques rely almost
exclusively on iterative algorithms. In this section, we shall present the principal methods
for numerically approximating the solution(s) to a system. We shall only discuss general
purpose algorithms. Specialized methods for particular classes of equations, e.g., methods
designed for solving polynomial equations, can be found in numerical analysis texts, e.g.,
[23, 101]. Of course, the most important specialized methods those designed for solving
linear systems will continue to play a critical role, even in the nonlinear regime.
The Bisection Method
We begin, as always, with the scalar case. Thus, we are given a real-valued function
f : R R, and seek its roots, i.e., the real solution(s) to the scalar equation
f (u) = 0.

(18.32)

Here are some prototypical examples:


(a) Find the roots of the quintic polynomial equation
    u⁵ + u + 1 = 0.                                                        (18.33)

Graphing the left hand side of the equation, as in Figure u5 , convinces us that there is
just one real root, lying somewhere between −1 and −.5. While there are explicit algebraic
formulas for the roots of quadratic, cubic, and quartic polynomials, a famous theorem due
to the Norwegian mathematician Niels Henrik Abel in the early 1800s states that there is
no such formula for generic fifth order polynomial equations.
(b) As noted in Example 18.5, the trigonometric Kepler equation

    u − ε sin u = m

Complex roots to complex equations will be discussed later.

A modern proof of this fact relies on Galois theory, [ 52 ].


arises in the study of planetary motion. Here ε, m are fixed constants, and we seek a
corresponding solution u. We have already looked at one iterative solution method for this
equation.
(c) Chemistry
The most primitive method for solving scalar equations, and the only one that is
guaranteed to work in all cases, is the bisection algorithm. While it has an iterative flavor,
it cannot be properly classed as a method governed by functional iteration as defined in
the preceding section, and so must be studied directly in its own right.
The starting point is the Intermediate Value Theorem, which we state in simplified
form, without proof. See Figure ivt for an illustration, and [9] for a proof.
Theorem 18.19. Let f (u) be a continuous scalar function. Suppose we can find
two points a < b where the values of f (a) and f (b) take opposite signs, so either f (a) < 0
and f (b) > 0, or f (a) > 0 and f (b) < 0. Then there exists at least one point a < u ? < b
where f (u? ) = 0.
The hypothesis can be compactly written as f (a) f (b) < 0. Note that if f (a) = 0 or
f (b) = 0, then finding a root is trivial. If f (a) and f (b) have the same sign, then there
may or may not be a root in between. Figure roots plots the functions u² + 1, u², and
u² − 1 on the interval −2 ≤ u ≤ 2. The first has no root; the second has a single double
root, while the third has two simple roots. Also, continuity of the function on the entire
interval [ a, b ] is an essential hypothesis. For example, the function f(u) = 1/u satisfies
f(−1) = −1 and f(1) = 1, but there is no root to the equation 1/u = 0.
Note carefully that Theorem 18.19 does not say there is a unique root between
a and b. There may be many roots (even, in pathological examples, infinitely many), as
illustrated in Figure root3 . All the theorem guarantees is that there is at least one root.
Once we are assured that a root exists, bisection amounts to a divide and conquer
strategy. Starting with the endpoints, the goal is to locate a root a < u ? < b between
them. Lacking any additional evidence, a good strategy would be to try the midpoint
c = ½ (a + b) as a first guess for the root. If, by some miracle, f(c) = 0, then we are done,
since we have found a solution! Otherwise (and typically) we look at the sign of f (c).
There are two possibilities. If f (a) and f (c) are of opposite signs, then the Intermediate
Value Theorem tells us that there is a root u? lying between a < u? < c. Otherwise, f (c)
and f (b) must have opposite signs, and so there is a root c < u? < b. In either event, we
apply the same method to the interval in which we are assured a root lies, and repeat the
procedure. Each iteration halves the length of the interval, and chooses the half in which
a root must be. (There may, of course, be a root in the other half, but we cannot be sure
of this, and so discard it from further consideration.) The root we home in on lies trapped
in intervals of smaller and smaller width, and so convergence of the method is guaranteed.
Figure bisect illustrates the steps in a particular example.
Example 18.20. The roots of the quadratic equation

    f(u) = u² + u − 3 = 0

     k      u^(k)      v^(k)      w^(k) = ½ (u^(k) + v^(k))      f(w^(k))

     0      1          2          1.5                              .75
     1      1          1.5        1.25                            −.1875
     2      1.25       1.5        1.375                            .2656
     3      1.25       1.375      1.3125                           .0352
     4      1.25       1.3125     1.2813                          −.0771
     5      1.2813     1.3125     1.2969                          −.0212
     6      1.2969     1.3125     1.3047                           .0069
     7      1.2969     1.3047     1.3008                          −.0072
     8      1.3008     1.3047     1.3027                          −.0002
     9      1.3027     1.3047     1.3037                           .0034
    10      1.3027     1.3037     1.3032                           .0016
    11      1.3027     1.3032     1.3030                           .0007
    12      1.3027     1.3030     1.3029                           .0003
    13      1.3027     1.3029     1.3028                           .0001
    14      1.3027     1.3028     1.3028                           .0000
are given by the quadratic formula

    u*₁ = (−1 + √13)/2 ≈ 1.302775 . . . ,        u*₂ = (−1 − √13)/2 ≈ −2.302775 . . . .
Let us see how one might approximate them by applying the Bisection Algorithm. We start
the procedure by choosing the points a = u^(0) = 1, b = v^(0) = 2, noting that f(1) = −1
and f(2) = 3 have opposite signs, and hence we are guaranteed that there is at least one
root between 1 and 2. In the first step we look at the midpoint of the interval [ 1, 2 ],
which is 1.5, and evaluate f(1.5) = .75. Since f(1) = −1 and f(1.5) = .75 have opposite
signs, we know that there is a root lying between 1 and 1.5. Thus, we use u^(1) = 1 and
v^(1) = 1.5 as the endpoints of the next interval, and continue. The next midpoint is at
1.25, where f(1.25) = −.1875 has the opposite sign to f(1.5) = .75, and so a root lies
between u^(2) = 1.25 and v^(2) = 1.5. The process is then iterated as long as desired or,
more practically, as long as your computer's precision does not become an issue.

The accompanying table displays the result of the algorithm, rounded off to four
decimal digits. Thus, after 14 iterations the Bisection Algorithm has computed the positive
root u*₁ correctly to 4 decimal places. A similar bisection starting with the interval from
−3 to −2 will produce the negative root.
The formal implementation of the algorithm is governed by the following program. The
endpoints of the k-th interval are denoted by u^(k) and v^(k). The midpoint is
w^(k) = ½ (u^(k) + v^(k)), and the main decision is whether w^(k) should be the right or left hand

endpoint of the new interval. The integer n, governing the number of iterations, will be
prescribed in accordance with how close we wish to approximate the solution u ? .

The Bisection Method

start
    if f(a) f(b) < 0, set u^(0) = a, v^(0) = b
    for k = 0 to n − 1
        set w^(k) = ½ (u^(k) + v^(k))
        if f(w^(k)) = 0, stop; print u* = w^(k)
        if f(w^(k)) f(u^(k)) < 0, set u^(k+1) = u^(k), v^(k+1) = w^(k)
        else set u^(k+1) = w^(k), v^(k+1) = v^(k)
    next k
    print u* = w^(n) = ½ (u^(n) + v^(n))
end
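A short Python transcription of the preceding pseudocode may help make the bookkeeping concrete; the function name and the error check at the start are illustrative choices, not part of the algorithm as stated above.

    def bisection(f, a, b, n):
        """Approximate a root of f in [a, b] by n bisection steps.

        Assumes f(a) and f(b) have opposite signs, as in Theorem 18.19.
        Returns the final midpoint w^(n), which lies within
        (b - a) / 2**(n + 1) of a root.
        """
        if f(a) * f(b) >= 0:
            raise ValueError("f(a) and f(b) must have opposite signs")
        u, v = a, b
        for _ in range(n):
            w = 0.5 * (u + v)
            if f(w) == 0.0:
                return w                  # lucky exact hit
            if f(w) * f(u) < 0:
                v = w                     # root lies in [u, w]
            else:
                u = w                     # root lies in [w, v]
        return 0.5 * (u + v)

    # Example 18.20: f(u) = u^2 + u - 3 on [1, 2]; 14 steps give about 1.30278.
    print(bisection(lambda u: u**2 + u - 3, 1.0, 2.0, 14))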

The algorithm produces two sequences of approximations u^(k) < u* < v^(k), so that the
root always lies between them. Both converge monotonically to the root, one from below
and the other from above:

    a = u^(0) ≤ u^(1) ≤ u^(2) ≤ · · · ≤ u^(k) ≤ u* ≤ v^(k) ≤ · · · ≤ v^(2) ≤ v^(1) ≤ v^(0) = b.

In other words, the solution u* is trapped inside a sequence of intervals [ u^(k), v^(k) ] of
progressively shorter and shorter length. Since we cut the interval in half at each step of
the algorithm, the length of the interval [ u^(k), v^(k) ] is exactly half that of [ u^(k−1), v^(k−1) ],
and so

    v^(k) − u^(k) = ½ (v^(k−1) − u^(k−1)).

Iterating this formula, we conclude that

    v^(n) − u^(n) = (½)^n (v^(0) − u^(0)) = (½)^n (b − a).

The final approximation

    w^(n) = ½ (u^(n) + v^(n))

lies in the middle of its interval, and hence must be within a distance

    | w^(n) − u* | ≤ ½ (v^(n) − u^(n)) = (½)^(n+1) (b − a)

of the root. Consequently, if we want to approximate the root within a prescribed tolerance
ε, we should choose the number of iterations n so that

    (½)^(n+1) (b − a) < ε,      or      n > log₂ ((b − a)/ε) − 1.          (18.34)

Theorem 18.21. If f(u) is a continuous function, with f(a) f(b) < 0, then the
bisection algorithm starting with u^(0) = a, v^(0) = b will converge to a solution to f(u) = 0
lying between a and b. After n steps, the midpoint w^(n) = ½ (u^(n) + v^(n)) will be within a
tolerance of ε = 2^(−n−1) (b − a) of the solution.

For example, in the case of the quadratic equation in Example 18.20, after 14 iterations we have approximated the positive root to within

    ε = (½)^15 (2 − 1) ≈ 3.052 × 10^(−5),

reconfirming our observation that we have accurately computed the first four decimal
places of the root. If we need 10 decimal places, we set our tolerance to ε = 10^(−11), and so,
according to (18.34), must perform n = 36 > 35.54 ≈ log₂ 10^11 − 1 successive bisections.
Example 18.22. As noted at the beginning of this section, the quintic equation

    f(u) = u⁵ + u + 1 = 0

has one real root, whose value can be readily computed by bisection. We start the algorithm
with the initial points u^(0) = −1, v^(0) = 0, noting that f(−1) = −1 < 0 while f(0) = 1 > 0
are of opposite signs. In order to compute the root to 6 decimal places, we set
ε = 10^(−7) in (18.34), and so need to perform n = 23 > 22.25 ≈ log₂ 10^7 − 1 bisections.
Indeed, the algorithm produces the approximation u* ≈ −0.754878 to the root, and the
displayed digits are guaranteed to be accurate.
Fixed Point Methods
The Bisection method converges in all cases provided it can be properly started
by locating two points where the function takes opposite signs. This may be tricky if
the function has two very closely spaced roots and is, say, negative only for a very small
interval between them, and may be impossible for multiple roots, e.g., the root u* = 0
of the quadratic function f(u) = u². When applicable, its convergence rate is completely
predictable, but not especially fast. Worse, it has no immediately apparent extension to
systems of equations, since there is no counterpart to the Intermediate Value Theorem for
vector-valued functions.
Most other methods for solving equations rely on some form of fixed point iteration.
Thus, we seek to replace the system of equations (18.32) with a fixed point system
u = g(u).

(18.35)

The key requirements are
(a) The solution u* to (18.32) is also a fixed point for equation (18.35), and
(b) u* is, in fact, a stable fixed point, so the Jacobian matrix g′(u*) is a convergent
matrix, or, slightly more restrictively, ‖ g′(u*) ‖ < 1 for a given matrix norm.
If both requirements are satisfied, then, provided we choose the initial iterate u^(0) = c
sufficiently close to u*, the iterates u^(k) will converge to the desired solution u* as k → ∞.
Thus, the key to the practical use of functional iteration for solving equations is the proper
design of an iterative system, coupled with a reasonably good initial guess for the solution.

Example 18.23. To solve the cubic equation

    f(u) = u³ − u − 1 = 0                                                   (18.36)

we note that f(1) = −1 while f(2) = 5, and so there is a root between 1 and 2. Indeed,
the bisection algorithm gives the approximate value u* ≈ 1.3247 after 17 iterations.

Let us try to find the same root by fixed point iteration. As a first, naive, guess, we
rewrite the cubic equation in fixed point form

    u = u³ − 1 = g̃(u).

Starting with the initial guess u^(0) = 1.5, the successive iterates are given by

    u^(k+1) = g̃(u^(k)) = (u^(k))³ − 1,        k = 0, 1, 2, . . . .

However, their values

    u^(0) = 1.5,    u^(1) = 2.375,    u^(2) = 12.3965,
    u^(3) = 1904.0,    u^(4) = 6.902 × 10⁹,    u^(5) = 3.289 × 10²⁹,    . . .

rapidly become unbounded and fail to converge. This could have been predicted by the
convergence criterion in Theorem 18.4. Indeed, g̃′(u) = 3 u² and so | g̃′(u) | > 3 for all
u > 1, including the root u*. This means that u* is an unstable fixed point, and we cannot
expect the iterates to converge to it.
On the other hand, we can rewrite the equation (18.36) in the alternative iterative
form

    u = ∛(1 + u) = g(u).

In this case

    0 < g′(u) = 1 / (3 (1 + u)^(2/3)) ≤ 1/3        for        u > 0.

Thus, the stability condition (18.13) is satisfied, and we anticipate convergence at a rate
of at least 1/3. (The bisection method converges more slowly, at rate 1/2.) Indeed, the first
few iterates u^(k+1) = ∛(1 + u^(k)) are

    1.5,    1.3571,    1.33086,    1.32588,    1.32494,    1.32476,    1.32473,

and we have converged to the root, correct to four decimal places, in only 6 iterations.
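For readers who wish to experiment, here is a minimal fixed point iteration in Python; the function name and the convergence test on successive iterates are illustrative choices rather than part of the text's algorithm.

    def fixed_point(g, u0, tol=1e-10, max_iter=100):
        """Iterate u_{k+1} = g(u_k) until successive iterates agree to tol."""
        u = u0
        for _ in range(max_iter):
            u_next = g(u)
            if abs(u_next - u) < tol:
                return u_next
            u = u_next
        return u   # may not have converged within max_iter steps

    # The stable rewriting of (18.36):  u = (1 + u)**(1/3)
    print(fixed_point(lambda u: (1.0 + u) ** (1.0 / 3.0), 1.5))   # about 1.324718

    # The naive rewriting u = u**3 - 1 diverges from the same starting point.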
Newton's Method

As we learned in Section 18.1, the speed of convergence of an iterative method based
on a scalar function g(u) is governed by the magnitude of its derivative, | g′(u*) |, at the
fixed point. Thus, to design an iterative method to solve an equation f(u) = 0, we need
(a) a function g(u) whose fixed points u* coincide with the solutions, and
(b) whose derivative at the fixed point is as small as possible.
In particular, if we can arrange that g′(u*) = 0, then, instead of a relatively slow linear
convergence rate, the numerical solution method will satisfy the dramatically faster
quadratic convergence estimate of Theorem 18.8, with all its consequent advantages.

Now, the first condition requires that g(u) = u whenever f(u) = 0. A little thought
will convince you that the iterative function should take the form

    g(u) = u − λ(u) f(u),                                                   (18.37)

where λ(u) is a reasonably nice function. If f(u*) = 0, then clearly u* = g(u*), and so u*
is a fixed point. The converse holds provided λ(u) ≠ 0 is never zero.

For a quadratically convergent method, the second requirement is that the derivative
of g(u) be zero at the fixed point solutions. We compute

    g′(u) = 1 − λ′(u) f(u) − λ(u) f′(u).

Thus, g′(u*) = 0 at a solution to f(u*) = 0 if and only if

    0 = 1 − λ′(u*) f(u*) − λ(u*) f′(u*) = 1 − λ(u*) f′(u*).

Consequently, we should require that

    λ(u*) = 1 / f′(u*)                                                      (18.38)

to ensure a quadratically convergent iterative scheme. This assumes that f′(u*) ≠ 0, which
means that u* is a simple root of f. We leave aside multiple roots, which require a different
argument and method, to be outlined in Exercise .

Of course, there are many functions λ(u) that satisfy (18.38), since we only need to
specify its value at a single point. The problem is that we do not know u*, after all this
is what we are trying to compute, and so cannot compute the value of the derivative
of f there. However, we can circumvent this apparent difficulty by a simple device: we
impose equation (18.38) at all points,

    λ(u) = 1 / f′(u),                                                       (18.39)

which certainly guarantees that it holds at the solution u*. The result is the function

    g(u) = u − f(u) / f′(u),                                                (18.40)

that yields the iteration scheme known as Newton's method. It dates back to Isaac Newton,
the founder of the calculus, and, to this day, it remains the most important general purpose
algorithm for solving equations. Newton's method starts with an initial guess u^(0) to be
supplied by the user, and then successively computes

    u^(k+1) = u^(k) − f(u^(k)) / f′(u^(k)).                                 (18.41)

Provided the initial guess is sufficiently close, the iterates u^(k) are guaranteed to converge
to the (simple) root u* of f.

Theorem 18.24. Suppose f(u) ∈ C² is twice continuously differentiable. Let u*
be a solution to the equation f(u) = 0 such that f′(u*) ≠ 0. Given an initial guess u^(0)
sufficiently close to u*, the Newton iteration scheme (18.41) converges at a quadratic rate
to the solution u*.

Proof: By continuity, if f′(u*) ≠ 0, then f′(u) ≠ 0 for all u sufficiently close to u*,
and hence the Newton iterative function (18.40) is well defined and continuously
differentiable near u*. Since g′(u) = f(u) f″(u)/f′(u)², we have g′(u*) = 0, as promised by our
construction. Hence, the result is an immediate consequence of Theorem 18.8.    Q.E.D.
Example 18.25. Consider the cubic equation

    f(u) = u³ − u − 1 = 0,

that we already solved in Example 18.23. The function used in the Newton iteration is

    g(u) = u − f(u)/f′(u) = u − (u³ − u − 1)/(3 u² − 1),

which is well-defined as long as u ≠ ± 1/√3. We will try to avoid these singular points. The
iterative procedure

    u^(k+1) = g(u^(k)) = u^(k) − ((u^(k))³ − u^(k) − 1)/(3 (u^(k))² − 1)

with initial guess u^(0) = 1.5 produces the following values:

    1.5,    1.34783,    1.32520,    1.32472,

which gives the root correctly to 5 decimal places after only three iterations. The quadratic
convergence of Newton's method implies that, roughly, each new iterate doubles the
number of correct decimal places. Thus, if we need to compute the root accurately to 40
decimal places, it would only require 3 further iterations! This underscores the tremendous
advantage that the Newton algorithm offers over competing methods such as bisection or
naive iteration.
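A compact Python version of the scalar Newton iteration (18.41) is sketched below; the function name, the tolerance test, and passing the derivative explicitly are illustrative choices.

    def newton(f, fprime, u0, tol=1e-12, max_iter=50):
        """Newton's method (18.41): u_{k+1} = u_k - f(u_k)/f'(u_k)."""
        u = u0
        for _ in range(max_iter):
            fu = f(u)
            if abs(fu) < tol:
                return u
            u = u - fu / fprime(u)       # assumes fprime(u) != 0 near the root
        return u

    # Example 18.25: f(u) = u^3 - u - 1, starting from u0 = 1.5.
    root = newton(lambda u: u**3 - u - 1, lambda u: 3*u**2 - 1, 1.5)
    print(root)                           # about 1.3247179572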
Example 18.26. The cubic polynomial equation

    f(u) = u³ − (3/2) u² + (5/9) u − 1/27 = 0

has

    f(0) = −1/27,    f(1/3) = 1/54,    f(2/3) = −1/27,    f(1) = 1/54.

The Intermediate Value Theorem 18.19 guarantees that there are three roots on the interval
[ 0, 1 ]: one between 0 and 1/3, the second between 1/3 and 2/3, and the third between 2/3 and 1.

This assumes we are working in a sufficiently high precision arithmetic so as to avoid round-off
errors.


The graph in Figure cubic reconfirms this observation. Since we are dealing with a cubic
polynomial, there are no other roots.

It takes sixteen iterations of the bisection algorithm starting with the three subintervals
[ 0, 1/3 ], [ 1/3, 2/3 ], and [ 2/3, 1 ] to produce the roots to six decimal places:

    u*₁ ≈ .085119,    u*₂ ≈ .451805,    u*₃ ≈ .963076.

Incidentally, if we start with the interval [ 0, 1 ] and apply bisection, we converge (perhaps
surprisingly) to the largest root u?3 in 17 iterations.
Fixed point iteration based on the formulation

    u = g(u) = − u³ + (3/2) u² + (4/9) u + 1/27

can be used to find the first and third roots, but not the second root. For instance, starting
with u^(0) = 0 produces u*₁ to 5 decimal places after 23 iterations, whereas starting with
u^(0) = 1 produces u*₃ to 5 decimal places after 14 iterations. The reason we cannot produce
u*₂ is due to the magnitude of the derivative

    g′(u) = − 3 u² + 3 u + 4/9

at the roots, which is


    g′(u*₁) ≈ 0.678065,    g′(u*₂) ≈ 1.18748,    g′(u*₃) ≈ 0.551126.

Thus, u*₁ and u*₃ are stable fixed points, but u*₂ is unstable. However, because g′(u*₁) and
g′(u*₃) are both bigger than .5, this iterative algorithm converges more slowly than ordinary
bisection!
Finally, Newton's method is based upon iteration of the function

    g(u) = u − f(u)/f′(u) = u − (u³ − (3/2) u² + (5/9) u − 1/27)/(3 u² − 3 u + 5/9).

Starting with an initial guess of u^(0) = 0, the method computes u*₁ to 5 decimal places after
only 4 iterations; starting with u^(0) = .5, it produces u*₂ after 2 iterations; while starting
with u^(0) = 1 produces u*₃ after 3 iterations, a dramatic speed-up over the other two
methods.
Newton's method has a very pretty graphical interpretation that helps us understand
what is going on and why it converges so fast. Given the equation f(u) = 0, suppose we
know an approximate value u = u^(k) for a solution. Nearby u^(k), we can approximate the
nonlinear function f(u) by its tangent line at the given point u^(k), which has the equation
    y = f(u^(k)) + f′(u^(k)) (u − u^(k)).                                   (18.42)

As long as the tangent line is not horizontal, which requires f′(u^(k)) ≠ 0, it crosses
the axis at the abscissa

    u^(k+1) = u^(k) − f(u^(k)) / f′(u^(k)),

which represents a new, and, presumably, more accurate, approximation to the desired
root. The procedure is illustrated pictorially in Figure Newton . Note that the passage
from u^(k) to u^(k+1) is exactly the Newton iteration step (18.41). In this manner, Newton's
method can be viewed as successive approximation of the function by its tangent line and
then using the root of the resulting affine function as the next approximation to the root
of the function.

Given sufficiently accurate initial guesses, Newton's method will then rapidly produce
accurate values for the simple roots of the equation in question. In practice, barring special
structure in the problem, Newton's method is the root-finding algorithm of choice. The
one caveat is that we need to come up with a reasonably close initial guess to the root we
are seeking. Otherwise, there is no guarantee that it will converge at all, although if the
Newton iterations do converge, we know that the limiting value is a root of our equation.
The behavior of Newton's method as we change parameters and vary the initial guess is
very similar to that of the logistic map, and includes period doubling bifurcations and chaotic
behavior. The reader is invited to experiment with simple examples, some of which are
provided in Exercise . For further details, see [100].
Example 18.27. For fixed values of the eccentricity ε, Kepler's equation

    u − ε sin u = m                                                         (18.43)

can be viewed as an implicit equation defining the eccentric anomaly u as a function of the
mean anomaly m. To solve the equation by Newton's method, we introduce the iterative
function

    g(u) = u − (u − ε sin u − m)/(1 − ε cos u).

Notice that when | ε | < 1, the denominator never vanishes, and so the iteration remains
well-defined everywhere. Starting with an initial guess u^(0), we are assured that the method
will quickly converge to the solution.

Fixing the eccentricity ε, we can employ a continuation method to determine how the
solution u* = h(m) depends upon the mean anomaly m. Namely, we start at m = m₀ = 0
with the obvious solution u* = h(0) = 0. Then, to compute the solution at successive
closely spaced values 0 < m₁ < m₂ < m₃ < · · ·, we use the previously computed value as
an initial guess u^(0) = h(m_k) for the value of the solution at the next mesh point m_(k+1),
and run the Newton scheme until it converges to the value u* = h(m_(k+1)). As long as m_(k+1)
is reasonably close to m_k, Newton's method will converge to the solution quite quickly.

The continuation method will quickly produce the values of u at the sample points
m_k. Intermediate values can either be determined by an interpolation scheme, e.g., a cubic
spline fit of the data, or by running the Newton scheme using the closest known value as
an initial condition. A plot for the value ε = .5 appears in Figure kepler .
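The continuation strategy just described is easy to prototype; the following Python sketch (the function name and the mesh spacing are illustrative choices) tabulates u* = h(m) for Kepler's equation with ε = .5.

    import math

    def kepler_newton(m, ecc, u0, tol=1e-12, max_iter=50):
        """Solve u - ecc*sin(u) = m by Newton's method, starting from u0."""
        u = u0
        for _ in range(max_iter):
            f = u - ecc * math.sin(u) - m
            if abs(f) < tol:
                return u
            u -= f / (1.0 - ecc * math.cos(u))   # denominator nonzero for |ecc| < 1
        return u

    # Continuation in the mean anomaly m: use the previous solution as the
    # initial guess at the next, closely spaced, mesh point.
    ecc, h = 0.5, [0.0]
    mesh = [0.01 * k for k in range(1, 101)]     # m = .01, .02, ..., 1.0
    for m in mesh:
        h.append(kepler_newton(m, ecc, h[-1]))
    print(h[-1])    # eccentric anomaly at m = 1.0 (about 1.4987 for ecc = .5)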
Systems of Equations
Let us now turn our attention to systems of equations. We shall only consider the
case when there are the same number of equations as unknowns:
    f₁(u₁, . . . , uₙ) = 0,        . . .        fₙ(u₁, . . . , uₙ) = 0.      (18.44)

We shall write the system (18.44) in vector form

    f(u) = 0,                                                               (18.45)

where f: Rⁿ → Rⁿ is a vector-valued function of n variables. Also, we do not necessarily
require that f be defined on all of Rⁿ, although this does simplify the exposition to a
certain degree.
We shall only consider solutions that are isolated, meaning separated from all the
others. More formally:
Definition 18.28. A solution u* to a system f(u) = 0 is called isolated if there
exists ε > 0 such that f(u) ≠ 0 for all u satisfying 0 < ‖ u − u* ‖ < ε.
Example 18.29. Consider the planar equation

    x² + y² = (x² + y²)².

Rewriting the equation in polar coordinates as

    r = r²,      or      r (r − 1) = 0,

we immediately see that the solutions consist of the origin x = y = 0 and all points on the
unit circle r² = x² + y² = 1. Only the origin is an isolated solution.
Typically, the solutions to a system of n equations in n unknowns are isolated, although
this is not always the case. For example, if A is a singular n × n matrix, then the
solutions to A u = 0 consist of a nontrivial subspace of Rⁿ, and so are not isolated. Nonlinear
systems with non-isolated solutions can similarly be viewed as having some form of
degeneracy. In general, the computation of non-isolated solutions, e.g., solving the implicit
equations for a curve or surface, is a much more difficult problem, and we will not attempt
to discuss these issues in this introductory presentation. However, our continuation
approach to the Kepler equation in Example 18.27 gives a hint as to how one might proceed
in such situations.
In the case of a single scalar equation, the simple roots are the most amenable to
practical computation. In higher dimensions, the role of the derivative of the function is
played by the Jacobian matrix (18.27), and this motivates the following definition.
Definition 18.30. A solution u* to a system f(u) = 0 is called nonsingular if the
associated Jacobian matrix is nonsingular there: det f′(u*) ≠ 0.

Note that the Jacobian matrix is square if and only if the system has the same number
of equations as unknowns, and so this is a requirement for a solution to be nonsingular.
Moreover, the Inverse Function Theorem, [9, 106], from multivariable calculus implies
that a nonsingular solution is necessarily isolated.
Theorem 18.31. If u? is a nonsingular solution to the system f (u) = 0, then u? is
an isolated solution.

As with simple roots of scalar equations, nonsingular solutions of systems are the most
amenable to practical computation. Non-isolated solutions, as well as isolated solutions
with singular Jacobian matrices, are much more difficult to compute, and very few useful
solution algorithms exist in such degenerate situations.
Now, let us turn to numerical solution techniques. The first remark is that, unlike
the scalar case, proving existence of a solution to a system of equations is often a difficult
problem. There is no counterpart to the Intermediate Value Theorem 18.19 for vector-valued functions; it is easy to construct examples of vector-valued functions, whose entries
take on both positive and negative values, but for which there are no solutions to the
system (18.45). For this reason, there is no decent analog of the Bisection method for
systems of equations.
On the other hand, Newton's method can be straightforwardly adapted to compute
nonsingular solutions to systems of equations, and forms the most widely used method for
this purpose. The derivation proceeds in a very similar manner to the scalar case. First, we
replace the system (18.45) by a fixed point system
replace the system (18.45) by a fixed point system
u = g(u)

(18.46)

having the same solutions. By direct analogy with (18.37), any (reasonable) fixed point
method will take the form

    g(u) = u − L(u) f(u),                                                   (18.47)

where L(u) is an n × n matrix-valued function. Clearly, if f(u) = 0 then g(u) = u;
conversely, if g(u) = u, then L(u) f(u) = 0. If we further require that the matrix L(u)
be nonsingular, i.e., det L(u) ≠ 0, then every fixed point of the iterator (18.47) will be a
solution to the system (18.45) and vice versa.
According to Theorem 18.14, the speed of convergence (if any) of the iterative method

    u^(k+1) = g(u^(k))                                                      (18.48)

is governed by the spectral radius or matrix norm of the Jacobian matrix g′(u*) at the
fixed point. In particular, if

    g′(u*) = O                                                              (18.49)

is the zero matrix, then the method is quadratically convergent. Computing the derivative
using a vector version of the Leibniz rule, we find

    g′(u*) = I − L(u*) f′(u*),                                              (18.50)

where I is the n × n identity matrix; see Exercise for details. (Fortunately, all the terms
that involve derivatives of the entries of L(u) go away, since f(u*) = 0 by assumption.)
Therefore, the quadratic convergence criterion (18.49) holds if and only if

    L(u*) f′(u*) = I,      and hence      L(u*) = f′(u*)⁻¹                  (18.51)

should be the inverse of the Jacobian matrix of f at the solution, which, fortuitously, was
already assumed to be nonsingular.

As in the scalar case, we don't know the solution u*, but we can arrange that condition
(18.51) holds by setting

    L(u) = f′(u)⁻¹

everywhere, or at least everywhere that f has a nonsingular Jacobian matrix. The
resulting fixed point system

    u = g(u) = u − f′(u)⁻¹ f(u)                                             (18.52)

leads to the quadratically convergent Newton iteration scheme

    u^(k+1) = u^(k) − f′(u^(k))⁻¹ f(u^(k)).                                 (18.53)

All it requires is that we guess an initial value u^(0) that is sufficiently close to the desired
solution u*. We are then guaranteed that the iterates u^(k) converge quadratically fast to
u*.

Theorem 18.32. Let u* be a nonsingular solution to the system f(u) = 0. Then,
provided u^(0) is sufficiently close to u*, the Newton iteration scheme (18.53) converges at
a quadratic rate to the solution: u^(k) → u*.
Example 18.33. Consider the pair of simultaneous cubic equations

    f₁(u, v) = u³ − 3 u v² − 1 = 0,        f₂(u, v) = 3 u² v − v³ = 0.       (18.54)

It is not difficult to prove that there are three solutions:

    u*₁ = ( 1, 0 )ᵀ,    u*₂ = ( −.5, .866025 . . . )ᵀ,    u*₃ = ( −.5, −.866025 . . . )ᵀ.

The Newton scheme relies on the Jacobian matrix

    f′(u) = [ 3 u² − 3 v²      −6 u v    ]
            [    6 u v      3 u² − 3 v² ].

Since det f′(u) = 9 (u² + v²)² is non-zero except at the origin, all three solutions are
nonsingular, and hence, for a sufficiently close initial value, Newton's method will converge.
We compute the inverse Jacobian matrix explicitly:

    f′(u)⁻¹ = 1/(9 (u² + v²)²) [ 3 u² − 3 v²       6 u v    ]
                               [   −6 u v       3 u² − 3 v² ].

Hence, in this particular example, the Newton iterator (18.52) is

    g(u) = ( u )  −  1/(9 (u² + v²)²) [ 3 u² − 3 v²       6 u v    ] ( u³ − 3 u v² − 1 )
           ( v )                      [   −6 u v       3 u² − 3 v² ] (   3 u² v − v³   ).
Implementing (18.53), we find that, starting with an initial guess sufficiently close to any
one of the three solutions, the iterates converge to it quadratically fast.

Remark: The alert reader may notice that in this example, we are in fact merely
computing the cube roots of unity, i.e., equations (18.54) are the real and imaginary parts
of the complex equation z³ = 1 when z = u + i v. A complete map of the basins of
attraction converging to the three different roots has a remarkably complicated, fractal-like
structure, as illustrated in Figure Newt3 .
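A minimal Python/NumPy sketch of the Newton iteration (18.53) for this system is given below; the function names are illustrative, and the step is obtained by a linear solve rather than by forming the inverse Jacobian, in line with the implicit formulation (18.56) developed below.

    import numpy as np

    def f(w):
        u, v = w
        return np.array([u**3 - 3*u*v**2 - 1, 3*u**2*v - v**3])

    def jacobian(w):
        u, v = w
        return np.array([[3*u**2 - 3*v**2, -6*u*v],
                         [6*u*v,            3*u**2 - 3*v**2]])

    def newton_system(w0, tol=1e-12, max_iter=50):
        """Newton iteration (18.53): solve f'(u) v = -f(u), then update u + v."""
        w = np.array(w0, dtype=float)
        for _ in range(max_iter):
            step = np.linalg.solve(jacobian(w), -f(w))
            w += step
            if np.linalg.norm(step) < tol:
                break
        return w

    # Starting near each cube root of unity converges to that root.
    print(newton_system([1.2, 0.3]))     # tends to ( 1, 0 )
    print(newton_system([-0.4, 0.9]))    # tends to ( -.5, .866025... )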

Example 18.34. A robot arm consists of two rigid rods, joined end to end, with one
end attached to a fixed point in the plane, which we take as the origin 0. The arms are free to rotate, and
the problem is to configure them so that the robot's hand ends up at the prescribed position
a = ( a, b )ᵀ. The first rod has length ℓ and makes an angle θ with the horizontal, so its
end is at position v₁ = ( ℓ cos θ, ℓ sin θ )ᵀ. The second rod has length m and makes an
angle φ with the horizontal, and so is represented by the vector v₂ = ( m cos φ, m sin φ )ᵀ.
The hand at the end of the second arm is at position v₁ + v₂, and the problem is to find
values for the angles θ, φ so that v₁ + v₂ = a. To this end, we need to solve the system of
equations

    ℓ cos θ + m cos φ = a,        ℓ sin θ + m sin φ = b.                    (18.55)
To compute the solution, we shall apply Newton's method. First, we compute the
Jacobian matrix of the system with respect to θ, φ, which is

    f′(θ, φ) = [ −ℓ sin θ    −m sin φ ]
               [   ℓ cos θ     m cos φ ].

As a result, the Newton iteration equation (18.53) has the explicit form

    ( θ^(k+1), φ^(k+1) )ᵀ = ( θ^(k), φ^(k) )ᵀ
        − 1/(ℓ m sin(φ^(k) − θ^(k))) [  m cos φ^(k)     m sin φ^(k) ] ( ℓ cos θ^(k) + m cos φ^(k) − a )
                                     [ −ℓ cos θ^(k)    −ℓ sin θ^(k) ] ( ℓ sin θ^(k) + m sin φ^(k) − b ).

When running the iteration, one must be careful to avoid points at which φ^(k) − θ^(k) = 0
or ±π, i.e., where the robot arm has straightened out.
As an example, let us assume that the rods have lengths ℓ = 2, m = 1, and the
desired location of the hand is at a = ( 1, 1 )ᵀ. We start with an initial guess of θ^(0) = 0,
φ^(0) = ½ π, so the first rod lies along the x-axis and the second is perpendicular. The first
few Newton iterates are given in the accompanying table. The first column gives the iterate
number k. The second and third columns indicate the angles θ^(k), φ^(k) of the rods. The
fourth and fifth give the position ( x^(k), y^(k) )ᵀ of the joint or elbow, while the final two
indicate the position ( z^(k), w^(k) )ᵀ of the robot's hand.

Thus, the robot has rapidly converged to one of the two possible configurations.
Convergence is dependent upon the initial configuration, and the iterates do not always
settle down. For instance, if ‖ a ‖ > ℓ + m, there is no possible solution, since the arms
are too short for the hand to reach the desired location; thus, no choice of initial conditions
will lead to a convergent scheme, and the robot arm flaps around in a chaotic manner.
Now that we have gained some experience with Newton's method for systems of equations,
some supplementary remarks are in order. As we learned back in Chapter 1, except
perhaps in very low-dimensional situations, one should not invert a matrix directly, but
rather use Gaussian elimination, or, in favorable situations, a linear iterative scheme, e.g.,
Jacobi, Gauss-Seidel or SOR, to solve a linear system. So it is better to write the Newton
equation (18.53) in unsolved, implicit form

    f′(u^(k)) v^(k) = − f(u^(k)),        u^(k+1) = u^(k) + v^(k).            (18.56)

     k      θ^(k)      φ^(k)      x^(k)      y^(k)      z^(k)      w^(k)

     0     0.0000     1.5708     2.0000     0.0000     2.0000     1.0000
     1     0.0000     2.5708     2.0000     0.0000     1.1585     0.5403
     2     0.3533     2.8642     1.8765     0.6920     0.9147     0.9658
     3     0.2917     2.7084     1.9155     0.5751     1.0079     0.9948
     4     0.2987     2.7176     1.9114     0.5886     1.0000     1.0000
     5     0.2987     2.7176     1.9114     0.5886     1.0000     1.0000
Given the iterate u^(k), we first compute the Jacobian matrix f′(u^(k)), and then use our
preferred linear systems solver to find v^(k). Adding u^(k) to the result immediately yields
the updated approximation u^(k+1) to the solution.
Therefore, the main bottleneck in the implementation of the Newton scheme, particularly
for large systems, is solving the linear system (18.56). The coefficient matrix
f′(u^(k)) must be recomputed at each step of the iteration, and hence knowing the solution
to the k-th linear system does not help us solve the next one in the sequence. Having to
re-implement a complete Gaussian elimination at every step will tend to slow down the
algorithm, particularly in high dimensional situations involving many equations in many
unknowns.

One simple dodge for speeding up the computation is to note that, once we start
converging, u^(k) will be very close to u^(k−1), and so we will probably not go far wrong by
using f′(u^(k−1)) in place of the updated Jacobian matrix f′(u^(k)). Since we have already
solved the linear system with coefficient matrix f′(u^(k−1)), we know its LU factorization,
and hence can use forward and back substitution to quickly solve the modified system

    f′(u^(k−1)) v^(k) = − f(u^(k)),        u^(k+1) = u^(k) + v^(k).          (18.57)

If u^(k+1) is still close to u^(k−1), we can continue to use f′(u^(k−1)) as the coefficient matrix
when proceeding on to the next iterate u^(k+2). We continue until there has been a notable
change in the iterates, at which stage we revert to solving the correct, unmodified
linear system (18.56) by Gaussian elimination. In this version of the algorithm, we update
the coefficient matrix every few iterations, particularly if the value of the approximations
has significantly changed. This device may dramatically reduce the total amount of computation
required to approximate the solution to a prescribed accuracy. The down side is
that this quasi-Newton scheme is only linearly convergent, and so does not home in on the
root as fast as the unmodified implementation. The user needs to balance the trade-off
between the speed of convergence and the amount of time needed to solve the linear system at
each step of the process.
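The following Python sketch illustrates the idea of reusing a factored Jacobian for several steps; the refresh criterion (refactor every few iterations) is an illustrative choice, and the routine names come from SciPy's dense LU interface.

    import numpy as np
    from scipy.linalg import lu_factor, lu_solve

    def quasi_newton(f, jac, u0, tol=1e-10, max_iter=100, refresh=5):
        """Newton-type iteration that refactors the Jacobian only every `refresh` steps.

        Solves f'(u) v = -f(u) by forward/back substitution with a stored
        LU factorization, as in (18.56)-(18.57).
        """
        u = np.array(u0, dtype=float)
        factors = lu_factor(jac(u))
        for k in range(max_iter):
            if k % refresh == 0:
                factors = lu_factor(jac(u))      # recompute and refactor f'(u)
            v = lu_solve(factors, -f(u))         # cheap solve with stored factors
            u += v
            if np.linalg.norm(v) < tol:
                break
        return u

Applied to the cube-roots-of-unity system above, this routine still converges, only more slowly during the stretches where the Jacobian is held fixed, in keeping with the linear convergence noted in the text.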

18.3. Optimization.
We have already remarked on the importance of quadratic minimization principles to
characterize the equilibrium solutions of a variety of linear systems. In nonlinear mathematics, optimization loses none of its centrality, and the wealth of practical applications

has spawned an entire subdiscipline of applied mathematics. Physical systems naturally
seek to minimize the potential energy function, and so determination of the possible
equilibrium configurations requires solving a nonlinear minimization principle. Engineering
design is guided by a variety of optimization constraints, such as performance, safety, cost
and marketability. Non-quadratic minimization principles also arise in the fitting of data
by more general schemes beyond the simple linear least squares approximation method
discussed in Section 4.4. Additional applications arise in economics and financial mathematics
(one often wishes to minimize costs or maximize profits), in manufacturing,
in biological and ecological systems, in pattern recognition and signal processing, and in
statistics.
The Objective Function

Throughout this section, the function F(u) = F(u₁, . . . , uₙ) to be minimized (the
energy, cost, entropy, performance, etc.) will be called the objective function. As such,
it depends upon one or more variables u = ( u₁, u₂, . . . , uₙ )ᵀ that belong to a prescribed
subset Ω ⊂ Rⁿ.
Definition 18.35. A point u* ∈ Ω is a global minimum of the objective function on
the domain Ω if

    F(u*) ≤ F(u)        for all        u ∈ Ω.                               (18.58)

The minimum is called strict if

    F(u*) < F(u)        for        u* ≠ u ∈ Ω.                              (18.59)

The point is called a local minimum if the inequality holds just for points u nearby u*,
i.e., satisfying ‖ u − u* ‖ < δ for some δ > 0. Thus, strict local minima are isolated.
The definition of a maximum, local or global, is the same, but with the reversed
inequality: F(u*) ≥ F(u) or, in the strict case, F(u*) > F(u). Alternatively, a maximum
of F(u) is the same as a minimum of its negative −F(u). Therefore, every result that
applies to minimization of a function can easily be translated into a result on maximization,
which allows us to concentrate exclusively on the minimization problem without any loss
of generality. We will use extremum as a shorthand term for either a maximum or a
minimum.
Remark: As we already noted in Section 4.1, any system of equations can be readily
converted into a minimization principle. Thus, given a system (18.45), we consider the
function

    F(u) = ‖ f(u) ‖² = f₁(u₁, . . . , uₙ)² + · · · + fₙ(u₁, . . . , uₙ)².     (18.60)

By the basic properties of the norm, the minimum value is F(u) = 0, and this is achieved
if and only if f(u) = 0, i.e., at a solution to the system.

Curiously, the term optimum is not used.

We use the standard Euclidean norm, but any other norm would work equally well here.


In contrast to the much more complicated existence question for systems of equations,
there is a general theorem that guarantees the existence of minima (and, hence, maxima)
for a very broad class of optimization problems.

Theorem 18.36. If F: Ω → R is continuous, and Ω ⊂ Rⁿ is closed and bounded,
then F has at least one global minimum u* ∈ Ω.

See [105, 106] for a proof. Although Theorem 18.36 assures us of the existence of a
global minimum of any continuous function on a bounded domain, it does not guarantee
uniqueness, nor does it indicate how to go about finding it. Just as with the solution of
nonlinear systems of equations, it is quite rare that one can find exact formulae for the
minima of non-quadratic functions. Our goal, then, is to formulate practical algorithms
that can accurately compute the minima of general nonlinear functions. A naive algorithm,
but one that is often successfully applied in practical problems, [101, opt], is to select a
reasonably dense set of sample points u^(k) in the domain and compare the values of F(u^(k)).
If the points are sufficiently densely distributed and the function is not too wild, this will
give a good approximation to the minimum. The algorithm can be sped up by using more
sophisticated methods of selecting the sample points.
As the student no doubt remembers, there are two different possible types of minima.
An interior minimum occurs at an interior point of the domain of definition of the function,
whereas a boundary minimum occurs on its boundary ∂Ω. Interior local minima are easier
to find, and, to keep the presentation simple, we shall focus our efforts on them.

Let us review the basic procedure for optimizing scalar functions that you learned in
calculus.
Example 18.37. Let us optimize the scalar function

    f(u) = 8 u³ + 5 u² − 6 u

on the domain −1 ≤ u ≤ 1. As you learned in first year calculus, the first step to finding
the minimum is to look at the critical points where the derivative vanishes:

    f′(u) = 24 u² + 10 u − 6 = 0,      and hence      u = 1/3, −3/4.

To ascertain the local nature of the two critical points, we apply the second derivative test.
Since f″(u) = 48 u + 10, we have

    f″(1/3) = 26 > 0,      whereas      f″(−3/4) = −26 < 0,

and we conclude that 1/3 is a local minimum, while −3/4 is a local maximum.

To find the global minimum and maximum on the interval [ −1, 1 ], we must also take
into account the boundary points ±1. Comparing the function values at the four points,

    f(1/3) = −31/27 ≈ −1.148,    f(−3/4) = 63/16 = 3.9375,    f(1) = 7,    f(−1) = 3,

we see that 1/3 is the global minimum, whereas 1 is the global maximum. This is borne out
by the graph of the function in Figure opt1 .
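A few lines of Python suffice to double-check such a computation numerically; the brute-force sampling at the end echoes the naive algorithm mentioned after Theorem 18.36, with the sample spacing an arbitrary choice.

    f = lambda u: 8*u**3 + 5*u**2 - 6*u

    # Values at the critical points and the boundary of [-1, 1]:
    for u in (1/3, -3/4, 1.0, -1.0):
        print(u, f(u))             # about -1.148, 3.9375, 7.0, 3.0

    # Naive sampling of the interval confirms the global minimum near u = 1/3.
    samples = [-1 + 0.0001*k for k in range(20001)]
    print(min(samples, key=f))     # approximately 0.3333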

The Gradient

As the student learns in multi-variable calculus, the (interior) extrema, minima
and maxima, of a smooth function F(u) = F(u₁, . . . , uₙ) are necessarily critical points,
meaning places where the gradient of F vanishes. The gradient of a function is, of course,
the vector whose entries are its first order partial derivatives:

    ∇F(u) = ( ∂F/∂u₁, . . . , ∂F/∂uₙ )ᵀ.                                     (18.61)

Let us, in preparation for more general minimization problems over infinite-dimensional
function spaces, reformulate the definition of the gradient in a more intrinsic manner.
An important but subtle point is that the gradient operator, in fact, relies upon the
introduction of an inner product on the underlying vector space. The standard version
(18.61) is based upon the Euclidean inner product on Rⁿ. Altering the inner product
will change the formula for the gradient!

Definition 18.38. Let V be an inner product space. Given a function F: Ω → R
defined on an open domain Ω ⊂ V, its gradient at a point u ∈ Ω is the vector ∇F(u) ∈ V
that satisfies

    ⟨ ∇F(u) ; v ⟩ = d/dt F(u + t v) |_(t=0)        for all        v ∈ V.     (18.62)

The left hand side of (18.62) is known as the directional derivative of F with respect to
v ∈ V, typically denoted by ∂F/∂v.

In the Euclidean case, when F(u) = F(u₁, . . . , uₙ) is a function of n variables, defined
for u ∈ Rⁿ, we can use the chain rule to compute

    d/dt F(u + t v) = d/dt F(u₁ + t v₁, . . . , uₙ + t vₙ)
                    = v₁ ∂F/∂u₁ (u + t v) + · · · + vₙ ∂F/∂uₙ (u + t v).      (18.63)

Setting t = 0, the right hand side of (18.62) reduces to

    d/dt F(u + t v) |_(t=0) = v₁ ∂F/∂u₁ (u) + · · · + vₙ ∂F/∂uₙ (u) = v · ∇F(u) = ∇F(u) · v.

Therefore, the directional derivative equals the Euclidean dot product between the usual
gradient of the function (18.61) and the direction vector v.

A function F(u) is continuously differentiable if and only if its gradient ∇F(u) is a
continuously varying function of u. This is equivalent to the requirement that the first
order partial derivatives ∂F/∂uᵢ are all continuous. As usual, we use C¹(Ω) to denote the
vector space of all continuously differentiable scalar-valued functions defined on a domain
Ω ⊂ Rⁿ. From now on, all objective functions are assumed to be continuously differentiable
on their domain of definition.

Remark : In this chapter, we will only deal with the standard Euclidean dot product
and hence the usual gradient (18.61). However, all results can be readily translated into
more general situations, e.g., weighted inner products. Details are outlined in Exercise .
More generally, if u(t) represents a parametrized curve contained within the domain
of definition of F(u), then the instantaneous rate of change in the scalar quantity F as we
move along the curve is given by

    d/dt F(u(t)) = ⟨ ∇F(u) ; du/dt ⟩,                                        (18.64)

which is the directional derivative of F with respect to the velocity or tangent vector
v = u̇ to the curve. For instance, our rate of ascent or descent as we travel through
the mountains is given by the dot product of our velocity vector with the gradient of the
elevation function. This leads us to one important interpretation of the gradient vector.
Theorem 18.39. The gradient ∇F of a scalar function F(u) points in the direction
of its steepest increase. The negative gradient, −∇F, which points in the opposite
direction, indicates the direction of steepest decrease.

For example, if F(u, v) represents the elevation of a mountain range at position (u, v)
on a map, then ∇F tells us the direction that is steepest uphill, while −∇F points directly
downhill, the direction water will flow. Similarly, if F(u, v, w) represents the temperature
of a solid body, then ∇F tells us the direction in which it is getting the hottest. Heat
energy (like water) will flow in the opposite, coldest direction, namely that of the negative
gradient vector −∇F.
You need to be careful in how you interpret Theorem 18.39. Clearly, the faster you
move along a curve, the faster the function F(u) will vary, and one needs to take this into
account when comparing the rates of change along different curves. The easiest way to
normalize is to assume that the tangent vector a = u̇ has norm 1, so ‖ a ‖ = 1, and we
are passing through the point u with unit speed. Once this is done, Theorem 18.39 is an
immediate consequence of the Cauchy-Schwarz inequality (3.13). Indeed,

    | ∂F/∂a | = | a · ∇F | ≤ ‖ a ‖ ‖ ∇F ‖ = ‖ ∇F ‖        when        ‖ a ‖ = 1,

with equality if and only if a = c ∇F points in the same direction as the gradient. Therefore,
the maximum rate of change is when a = ∇F/‖ ∇F ‖ is the unit vector in the direction
of the gradient, while the minimum is achieved when a = −∇F/‖ ∇F ‖ points in the
opposite direction. As a result, Theorem 18.39 tells us how to move if we wish to minimize
a scalar function as rapidly as possible.

Theorem 18.40. A curve u(t) will realize the steepest decrease in the scalar field
F(u) if and only if it satisfies the gradient flow equation

    u̇ = − ∇F(u).                                                            (18.65)
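Numerically, one can follow the gradient flow (18.65) by taking small steps in the direction of −∇F; the following Python sketch is the simplest such gradient descent scheme, with the step size and stopping rule chosen purely for illustration.

    import numpy as np

    def gradient_descent(grad_F, u0, step=0.1, tol=1e-8, max_iter=10000):
        """Crude discretization of the gradient flow u' = -grad F(u)."""
        u = np.array(u0, dtype=float)
        for _ in range(max_iter):
            g = grad_F(u)
            if np.linalg.norm(g) < tol:      # (approximate) critical point reached
                break
            u -= step * g
        return u

    # F(u, v) = u^4 - 2 u^2 + v^2 from Example 18.43 below; minima at (1, 0), (-1, 0).
    grad_F = lambda w: np.array([4*w[0]**3 - 4*w[0], 2*w[1]])
    print(gradient_descent(grad_F, [0.5, 1.0]))    # converges to ( 1, 0 )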

Critical Points
Let us now prove that the gradient vanishes at any local minimum of the function.
The most important thing about this proof is that it only relies on the intrinsic definition
of gradient, and therefore applies to any function on any inner product space. Moreover,
even though the gradient can change if we alter the underlying inner product, the condition
that it vanishes at a local extremum does not.
Definition 18.41. A point u* is called a critical point of the objective function F(u)
if

    ∇F(u*) = 0.                                                              (18.66)

Theorem 18.42. If u* is a local (interior) minimum of F(u), then ∇F(u*) = 0,
and so u* is a critical point.
Proof: Let v ∈ Rⁿ be any vector. Consider the function

    g(t) = F(u* + t v) = F(u*₁ + t v₁, . . . , u*ₙ + t vₙ),                  (18.67)

where t ∈ R is sufficiently small to ensure that u* + t v remains inside the domain
of F. Thus, g measures the values of F along a straight line passing through u* in the
direction prescribed by v. Since u* is a local minimum,

    F(u*) ≤ F(u* + t v),      and hence      g(0) ≤ g(t)

for all t sufficiently close to zero. In other words, g(t), as a function of the single variable t,
has a local minimum at t = 0. By the basic calculus result on minima of functions of one
variable, the derivative of g(t) must vanish at t = 0. Therefore, by the definition (18.62)
of gradient,

    0 = g′(0) = d/dt F(u* + t v) |_(t=0) = ⟨ ∇F(u*) ; v ⟩.

We conclude that the gradient vector ∇F(u*) at the critical point must be orthogonal
to every vector v ∈ Rⁿ. The only vector that is orthogonal to every vector in an inner
product space is the zero vector, and hence ∇F(u*) = 0.    Q.E.D.
Remark: As we learned, the gradient vector ∇F points in the direction of the steepest
increase in the function, while its negative, −∇F(u), points in the direction of steepest
decrease. At a minimum of the function, all directions are increasing, and so there is no
direction of steepest decrease. The only way that the gradient can avoid this little dilemma
is for it to vanish, which provides an intuitive explanation of why minima (and maxima)
must be critical points.

If v = 0, then the line degenerates to a point, but the ensuing argument remains (trivially)
valid.


Thus, provided the objective function is continuously differentiable, every interior


minimum, both local and global, is necessarily a critical point. The converse is not true;
critical points can be maxima; they can also be saddle points or of some degenerate form.
The basic analytical method for determining the (interior) minima of a given function is
to first find all its critical points by solving the system of equations (18.66). Each critical
point then needs to be more closely examined as it could be either a minimum, or a
maximum, or neither.
Example 18.43. Consider the function

    F(u, v) = u⁴ − 2 u² + v²,

which is defined and continuously differentiable on all of R². Since ∇F = ( 4 u³ − 4 u, 2 v )ᵀ,
its critical points are obtained by solving the system of equations

    4 u³ − 4 u = 0,        2 v = 0.

The solutions to the first equation are u = 0, ±1, while the second equation requires v = 0.
Therefore, F has three critical points:

    u*₁ = ( 0, 0 )ᵀ,    u*₂ = ( 1, 0 )ᵀ,    u*₃ = ( −1, 0 )ᵀ.                (18.68)

Inspecting the graph in Figure u42u2v2 , we suspect that the first critical point u*₁ is a
saddle point, whereas the other two are both global minima for the function, with the
same value F(u*₂) = F(u*₃) = −1. This will be confirmed once we learn how to rigorously
distinguish critical points.
If F(u) is defined on a closed subdomain Ω ⊂ Rⁿ, then its minima may also occur
at boundary points u ∈ ∂Ω, and there is no requirement that the gradient vanish at
such boundary minima. The analytical determination of boundary extrema relies on the
method of Lagrange multipliers, and we refer the interested reader to [9, 30]. If the domain
is unbounded, one must also worry about the asymptotic behavior of the function for large
u. In order to keep our presentation simple, we shall relegate these more involved issues
to a more advanced text.

The student should also pay attention to the distinction between local minima and
global minima. Both are critical points. In the absence of theoretical justification, the
only practical way to determine whether or not a minimum is global is to find all the
different local minima and see which one gives the smallest value. In many examples
arising in applications, when F(u) is an energy function, one knows that the function
is bounded from below, and hence, from general principles, that a global minimum exists,
even when the domain is unbounded.

Numerical methods are discussed below.


The Second Derivative Test

The status of a critical point, minimum, maximum, or neither, can often be resolved
by analyzing the second derivative of the objective function at the critical point. Let us
first review the one variable second derivative test from first year calculus.

Proposition 18.44. Let g(t) ∈ C² be a scalar function, and suppose t* is a critical
point, so g′(t*) = 0. If t* is a local minimum, then g″(t*) ≥ 0. Conversely, if g″(t*) > 0,
then t* is a strict local minimum. Similarly, g″(t*) ≤ 0 is required at a local maximum,
while g″(t*) < 0 implies that t* is a strict local maximum.

The proof of this result relies on the quadratic Taylor approximation

    g(t) ≈ g(t*) + ½ (t − t*)² g″(t*)

near the critical point, cf. (C.7), where we use the fact that g′(t*) = 0 and so the linear terms
in the Taylor polynomial vanish. If g″(t*) ≠ 0, then the quadratic Taylor polynomial
has a minimum or maximum at t* according to the sign of the second derivative. In the
borderline case, when g″(t*) = 0, the second derivative test is inconclusive, and the point
could be either maximum, minimum, saddle point, or degenerate. One must then look at
the higher order terms in the Taylor expansion to resolve the issue; see Exercise .
In multi-variable calculus, the second derivative is represented by the n × n Hessian
matrix

    ∇²F(u) = [ ∂²F/∂u₁²        ∂²F/∂u₁∂u₂    . . .    ∂²F/∂u₁∂uₙ ]
             [ ∂²F/∂u₂∂u₁      ∂²F/∂u₂²      . . .    ∂²F/∂u₂∂uₙ ]
             [     ...             ...        . . .       ...     ]
             [ ∂²F/∂uₙ∂u₁      ∂²F/∂uₙ∂u₂    . . .    ∂²F/∂uₙ²   ],          (18.69)

named after the nineteenth century German mathematician Ludwig Otto Hesse. The entries
of the Hessian are the second order partial derivatives of the objective function. If F(u) ∈
C² has continuous second order partial derivatives, then its Hessian matrix is symmetric,
∇²F(u) = ∇²F(u)ᵀ, which is a restatement of the fact that its mixed partial derivatives
are equal: ∂²F/∂uᵢ∂uⱼ = ∂²F/∂uⱼ∂uᵢ, cf. [9, 30]. For the applicability of the second
derivative test, this is an essential ingredient.
The second derivative test for a local minimum of a scalar function relies on the positivity
of its second derivative. For a function of several variables, the corresponding condition
is that the Hessian matrix be positive definite, as in Definition 3.22. More specifically:

Theorem 18.45. Let F(u) = F(u₁, . . . , uₙ) ∈ C²(Ω) be a real-valued, twice
continuously differentiable function defined on an open domain Ω ⊂ Rⁿ. If u* ∈ Ω is a
(local, interior) minimum for F, then it is necessarily a critical point, so ∇F(u*) = 0.
Moreover, the Hessian matrix (18.69) must be positive semi-definite at the minimum, so
∇²F(u*) ≥ 0. Conversely, if u* is a critical point with positive definite Hessian matrix
∇²F(u*) > 0, then u* is a strict local minimum of F.
Proof: We return to the proof of Theorem 18.42. Given a local minimum u*, the
scalar function g(t) = F(u* + t v) in (18.67) has a local minimum at t = 0. As noted
above, basic calculus tells us that its derivatives at t = 0 must satisfy

    g′(0) = 0,        g″(0) ≥ 0.                                             (18.70)

The first condition leads to the critical point equation ∇F(u*) = 0. A straightforward
chain rule calculation produces the formula

    g″(0) = Σ_{i,j=1}^{n} ∂²F/∂uᵢ∂uⱼ (u*) vᵢ vⱼ = vᵀ ∇²F(u*) v.

As a result, the second condition in (18.70) requires that

    vᵀ ∇²F(u*) v ≥ 0.

Since this condition is required for every direction v ∈ Rⁿ, the Hessian matrix ∇²F(u*) ≥ 0
satisfies the criterion for positive semi-definiteness, proving the first part of the theorem.
Conversely, if the Hessian ∇²F(u*) > 0 is positive definite, then

    g″(0) = vᵀ ∇²F(u*) v > 0        for all        v ≠ 0,

and so t = 0 is a strict local minimum for g(t). Since this occurs for every direction v, this
implies F(u*) < F(u) for all u near u*, and so u* is a strict local minimum.    Q.E.D.

    We are ignoring some technical details that need cleaning up for a completely rigorous proof,
which relies on the multivariable Taylor expansion of F(u). See Appendix C.
A maximum requires a negative semi-definite Hessian matrix. If, moreover, the Hessian at the critical point is negative definite, then the critical point is a strict local maximum. If the Hessian matrix is indefinite, then the critical point is a saddle point, and
neither minimum nor maximum. In the borderline case when the Hessian is only positive
or negative semi-definite at the critical point, then the second derivative test is inconclusive. Resolving the nature of the critical point requires more detailed knowledge of the
objective function, e.g., its higher order derivatives.
Example 18.46. As a first, elementary example, consider the quadratic function

    F(u, v) = u² − 2 u v + 3 v².

To minimize F, we begin by computing its gradient ∇F = ( 2 u − 2 v, −2 u + 6 v )ᵀ. Solving the
pair of equations ∇F = 0, namely

    2 u − 2 v = 0,        −2 u + 6 v = 0,

we see that the only critical point is the origin u = v = 0. To test whether the origin is a
maximum or minimum, we further compute the Hessian matrix

    H = ∇²F(u, v) = [ F_uu   F_uv ]  =  [  2   −2 ]
                    [ F_uv   F_vv ]     [ −2    6 ].

Using the methods of Section 3.5, we easily prove that the Hessian matrix is positive
definite. Therefore, by Theorem 18.45, u* = 0 is a strict local minimum of F.

Indeed, we recognize F(u, v) to be, in fact, a homogeneous positive definite quadratic
form, which can be written in the form

    F(u, v) = uᵀ K u,      where      K = [  1   −1 ]  =  ½ H,      u = ( u, v )ᵀ.
                                          [ −1    3 ]

Positive definiteness of the coefficient matrix K implies that F(u, v) > 0 for all
u = ( u, v )ᵀ ≠ 0, and hence 0 is, in fact, a global minimum.
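One can also check such definiteness conditions numerically by inspecting the eigenvalues of the Hessian (all positive means positive definite, mixed signs means indefinite); the following NumPy snippet, applied to the Hessians in this and the next example, is a hedged illustration rather than a replacement for the algebraic tests of Section 3.5.

    import numpy as np

    def classify(hessian):
        """Classify a symmetric Hessian by the signs of its eigenvalues."""
        eigs = np.linalg.eigvalsh(hessian)
        if np.all(eigs > 0):
            return "positive definite (strict local minimum)"
        if np.all(eigs < 0):
            return "negative definite (strict local maximum)"
        if np.any(eigs > 0) and np.any(eigs < 0):
            return "indefinite (saddle point)"
        return "semi-definite (test inconclusive)"

    print(classify(np.array([[2., -2.], [-2., 6.]])))   # Example 18.46: minimum
    print(classify(np.array([[2., 0.], [0., -2.]])))    # Example 18.47: saddle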
In general, any quadratic function Q(u) = Q(u₁, . . . , uₙ) can be written in the form

    Q(u) = uᵀ K u − 2 bᵀ u + c = Σ_{i,j=1}^{n} k_ij uᵢ uⱼ − 2 Σ_{i=1}^{n} bᵢ uᵢ + c,      (18.71)

where K = Kᵀ is a symmetric n × n matrix, b ∈ Rⁿ is a fixed vector, and c ∈ R is a
scalar. A straightforward computation produces the formula for its gradient and Hessian
matrix:

    ∇Q(u) = 2 K u − 2 b,        ∇²Q(u) = 2 K.                                 (18.72)
As a result, the critical points of the quadratic function are the solutions to the linear
system K u = b. If K is nonsingular, there is a unique critical point u? , which is a strict
local minimum if and only if K > 0 is positive definite. In fact, Theorem 4.2 tells us
that, in the positive definite case, u? is a strict global minimum for Q(u). Thus, the
algebraic approach of Chapter 4 provides additional, global information that cannot be
gleaned directly from the local, multivariable calculus Theorem 18.45. But algebra is only
able to handle quadratic minimization problems with ease. The analytical classification of
minima and maxima of more complicated objective functions necessarily relies on the gradient
and Hessian criteria of Theorem 18.45.
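As a concrete illustration (a sketch of my own, not from the text), the following Python snippet minimizes a quadratic of the form (18.71) by solving the linear system K u = b from (18.72), and confirms positive definiteness of K through its eigenvalues; the particular K, b, c are arbitrary choices, with K borrowed from Example 18.46.

```python
import numpy as np

# Quadratic Q(u) = u^T K u - 2 b^T u + c, cf. (18.71).
K = np.array([[1.0, -1.0],
              [-1.0, 3.0]])       # symmetric coefficient matrix (from Example 18.46)
b = np.array([0.0, 0.0])           # linear term; zero here for illustration
c = 0.0

# Critical point: solve the linear system K u = b, cf. (18.72).
u_star = np.linalg.solve(K, b)

# Positive definiteness check: all eigenvalues of K strictly positive.
eigenvalues = np.linalg.eigvalsh(K)
is_positive_definite = np.all(eigenvalues > 0)

Q_min = u_star @ K @ u_star - 2 * b @ u_star + c
print("critical point:", u_star)
print("K positive definite:", is_positive_definite)
print("Q at critical point:", Q_min)
```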
Example 18.47. The function
\[ F(u,v) = u^2 + v^2 - v^3 \qquad \text{has gradient} \qquad \nabla F(u,v) = \begin{pmatrix} 2u \\ 2v - 3v^2 \end{pmatrix}. \]
There are two solutions to the critical point equation ∇F = 0: u₁* = (0, 0)ᵀ and
u₂* = (0, 2/3)ᵀ. The Hessian matrix of the objective function is
\[ \nabla^2 F(u,v) = \begin{pmatrix} 2 & 0 \\ 0 & 2 - 6v \end{pmatrix}. \]
At the first critical point, the Hessian
\[ \nabla^2 F(0,0) = \begin{pmatrix} 2 & 0 \\ 0 & 2 \end{pmatrix} \]
is positive definite. Therefore, the origin is a strict local minimum. On the other hand,
\[ \nabla^2 F\left( 0, \tfrac{2}{3} \right) = \begin{pmatrix} 2 & 0 \\ 0 & -2 \end{pmatrix} \]
is indefinite, and hence u₂* = (0, 2/3)ᵀ is a saddle point. The function is graphed in Figure 18.1,
with the critical points indicated by the small solid balls. The origin is, in fact, only a
local minimum, since F(0,0) = 0, whereas F(0,v) < 0 for all v > 1. Thus, there is no
global minimum or maximum on ℝ².

[Figure 18.1. Critical Points: the graphs of u² + v² − v³, u² + v⁴, and u² + v³.]

Next, consider the function
\[ F(u,v) = u^2 + v^4, \qquad \text{with gradient} \qquad \nabla F(u,v) = \begin{pmatrix} 2u \\ 4v^3 \end{pmatrix}. \]
The only critical point is the origin u = v = 0. The origin is a strict global minimum
because F(u,v) > 0 = F(0,0) for all (u,v) ≠ (0, 0)ᵀ. However, its Hessian matrix
\[ \nabla^2 F(u,v) = \begin{pmatrix} 2 & 0 \\ 0 & 12\,v^2 \end{pmatrix} \]
is only positive semi-definite at the origin, ∇²F(0,0) = \begin{pmatrix} 2 & 0 \\ 0 & 0 \end{pmatrix}, and the second derivative
test is inconclusive.
On the other hand, the origin u = v = 0 is also the only critical point for the function
\[ F(u,v) = u^2 + v^3, \qquad \text{with} \qquad \nabla F(u,v) = \begin{pmatrix} 2u \\ 3v^2 \end{pmatrix}. \]
The Hessian matrix is
\[ \nabla^2 F(u,v) = \begin{pmatrix} 2 & 0 \\ 0 & 6v \end{pmatrix}, \qquad \text{and so} \qquad \nabla^2 F(0,0) = \begin{pmatrix} 2 & 0 \\ 0 & 0 \end{pmatrix} \]
is the same positive semi-definite matrix at the critical point. However, in this case (0, 0)
is not a local minimum; indeed
\[ F(0,v) < 0 = F(0,0) \qquad \text{whenever} \qquad v < 0, \]

and so there exist points arbitrarily close to the origin where F takes on smaller values.
As illustrated in Figure 18.1, the origin is, in fact, a degenerate saddle point.
Finally, the function
\[ F(u,v) = u^2 - 2\,u\,v + v^2 \qquad \text{has gradient} \qquad \nabla F(u,v) = \begin{pmatrix} 2u - 2v \\ -2u + 2v \end{pmatrix}, \]
and so every point u = v is a critical point. The Hessian matrix
\[ \nabla^2 F(u,v) = \begin{pmatrix} F_{uu} & F_{uv} \\ F_{uv} & F_{vv} \end{pmatrix} = \begin{pmatrix} 2 & -2 \\ -2 & 2 \end{pmatrix} \]
is positive semi-definite everywhere. Since F(u,u) = 0, while F(u,v) = (u − v)² > 0 when
u ≠ v, each of these critical points is a non-isolated local minimum, but not a strict local
minimum. Thus, comparing the three preceding examples, we see that a semi-definite
Hessian cannot completely distinguish critical points.
Finally, the reader should always remember that first and second derivative tests only
determine the local behavior of the function near the critical point. They cannot be used
to determine whether or not we are at a global minimum. This requires some additional
analysis, and, often, a fair amount of ingenuity.
Minimization of Scalar Functions
In practical optimization, one typically bypasses the preliminary characterization of
minima as critical points, and instead implements a direct iterative procedure that constructs a sequence of successively better approximations. As the computation progresses,
the approximations are adjusted so that the objective function is made smaller and smaller,
which, we hope, will ensure that we are converging to some form of minimum.
As always, to understand the issues involved, it is essential to consider the simplest
scalar situation. Thus, we are given the problem of minimizing a scalar function F(u) on
a bounded interval a ≤ u ≤ b. The minimum value can occur either at an endpoint or at an
interior point of the interval. Let us first state a result that plays a similar role to the Intermediate
Value Theorem 18.19 that formed the basis of the bisection method for finding roots.
Lemma 18.48. Suppose that F(u) is defined and continuous for all a ≤ u ≤ b.
Suppose that we can find a point a < c < b such that F(c) < F(a) and F(c) < F(b). Then
F(u) has a minimum at some point a < u* < b.
The proof is an easy consequence of Theorem 18.36. Therefore, if we find three points
a < c < b satisfying the conditions of the lemma, we are assured of the existence of a local
minimum for the function between the two endpoints. Once this is done, we can design
an algorithm to home in on the minimum u*. We choose another point, say d, between
a and c and evaluate F(d). If F(d) < F(c), then F(d) < F(a) also, and so the points
a < d < c satisfy the hypotheses of Lemma 18.48. Otherwise, if F(d) > F(c), then the
points d < c < b satisfy the hypotheses of the lemma. In either case, a local minimum
has been narrowed down to a smaller interval, either [a, c] or [d, b]. In the unlikely event
that F(d) = F(c), one can try another point instead; unless the objective function is

constant, one will eventually find a suitable value of d. Iterating the method will produce
a sequence of progressively smaller and smaller intervals in which the minimum is trapped,
and, just like the bisection method, the endpoints of the intervals get closer and closer to
u*.
The one question is how to choose the point d. We described the algorithm when
it was selected to lie between a and c, but one could equally well try a point between
c and b. To speed up the algorithm, it makes sense to place d in the larger of the two
subintervals [ a, c ] and [ c, b ]. One could try placing d in the midpoint of the interval, but
a more inspired choice is to place it at the golden section point of the larger subinterval. The
result is the Golden Section Method, and is outlined in the accompanying program. At each
stage, the length of the interval has been reduced by a factor of
\[ \tfrac{1}{2}\left( \sqrt{5} - 1 \right) \approx .61803. \]
Thus, the convergence rate is linear, and a bit slower than the bisection algorithm.
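A minimal Python sketch of the golden section idea (an illustration of mine, not the accompanying program referred to above): each pass discards the portion of the bracket that cannot contain the minimum, so the bracket length shrinks by the factor ½(√5 − 1) ≈ .61803 per new function evaluation.

```python
import math

def golden_section_minimize(F, a, b, tol=1e-8):
    """Minimize a unimodal scalar function F on the bracket [a, b]."""
    r = (math.sqrt(5) - 1) / 2           # golden ratio factor, about 0.61803
    c = b - r * (b - a)                   # two interior trial points
    d = a + r * (b - a)
    Fc, Fd = F(c), F(d)
    while b - a > tol:
        if Fc < Fd:                       # minimum trapped in [a, d]
            b, d, Fd = d, c, Fc
            c = b - r * (b - a)
            Fc = F(c)
        else:                             # minimum trapped in [c, b]
            a, c, Fc = c, d, Fd
            d = a + r * (b - a)
            Fd = F(d)
    return (a + b) / 2

# Example usage: a simple scalar objective with its minimum at u = 2.
print(golden_section_minimize(lambda u: (u - 2)**2 + 1, 0.0, 5.0))
```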
Another strategy is to use an interpolating polynomial through the three points on the
graph of F (u) and use the minimum value of that polynomial as the next approximation
to the minimum. According to Exercise , the minimizing value is at
\[ d = \frac{m\,t - n\,s}{t - s}\,, \qquad \text{where} \qquad s = \frac{F(c) - F(a)}{c - a}\,, \quad t = \frac{F(b) - F(c)}{b - c}\,, \quad m = \frac{a + c}{2}\,, \quad n = \frac{c + b}{2}\,. \]

As long as a < c < b satisfy the hypothesis of Lemma 18.48, we are assured that the quadratic interpolant has a minimum (and not a maximum!), and that the minimum remains
between the endpoints of the interval. If the length of the interval is small, the minimum
value should be a good approximation to the minimizer u* of F(u) itself. Once d is determined, the algorithm proceeds as before. In this case, convergence is not quite guaranteed,
or, in unfavorable situations, could be much slower than in the preceding method. One
can even try using the method when the function values do not satisfy the hypothesis
of Lemma 18.48, although now the new point d will not necessarily lie between a and b.
Worse, the quadratic interpolant may have a maximum at d, and one ends up going in the
wrong direction, which can even happen in the minimizing case due to the discrepancy
between the interpolant and the objective function F(u). Thus, this case must be handled with more
caution, and convergence of the scheme is much more fraught with danger.
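For concreteness, the interpolation step amounts to a few lines of arithmetic; the snippet below is an illustration of mine, not code from the text.

```python
def quadratic_interpolation_point(a, c, b, Fa, Fc, Fb):
    """Trial point d: minimizer of the parabola through (a,Fa), (c,Fc), (b,Fb)."""
    s = (Fc - Fa) / (c - a)      # secant slope on [a, c]
    t = (Fb - Fc) / (b - c)      # secant slope on [c, b]
    m = (a + c) / 2              # the interpolant's derivative equals s at m ...
    n = (c + b) / 2              # ... and t at n, so we find where it vanishes
    return (m * t - n * s) / (t - s)

# Example: F(u) = (u - 2)**2 bracketed by a = 0 < c = 1 < b = 5.
print(quadratic_interpolation_point(0, 1, 5, 4, 1, 9))   # exactly 2.0
```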
A final idea is to focus not on the objective function F(u) but rather on its derivative
f(u) = F′(u). The critical points of F are the roots of f(u) = 0, and so one can use one
of the root-finding methods, e.g., bisection or Newton's method, to find the critical points. Of
course, one must then take care that the critical point u* is indeed a minimum, as it could
equally well be a maximum of the original objective function. (It will probably not be a
saddle point, as these do not correspond to simple roots of f(u).) But this can be checked
by looking at the sign of F″(u*) = f′(u*) at the root; indeed, if we use Newton's method
we will be computing the derivative at each stage of the algorithm, and can stop looking
if the derivative is of the wrong sign.
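A minimal sketch of this derivative-based strategy, assuming F′ and F″ are available in closed form; the sample objective below is purely illustrative and not taken from the text.

```python
def newton_critical_point(fprime, fsecond, u0, tol=1e-10, max_iter=50):
    """Newton's method applied to f(u) = F'(u); stops if F'' is not positive."""
    u = u0
    for _ in range(max_iter):
        f, fp = fprime(u), fsecond(u)
        if fp <= 0:
            raise ValueError("F'' <= 0: heading toward a maximum or degenerate point")
        step = f / fp
        u -= step
        if abs(step) < tol:
            return u
    raise RuntimeError("Newton iteration did not converge")

# Example: F(u) = u^4 - 3 u^2 + u, so F'(u) = 4 u^3 - 6 u + 1, F''(u) = 12 u^2 - 6.
u_min = newton_critical_point(lambda u: 4*u**3 - 6*u + 1,
                              lambda u: 12*u**2 - 6, u0=1.0)
print(u_min)   # a local minimizer near u = 1.13
```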

Gradient Descent
Now, let us turn our attention to multi-dimensional optimization problems. We are
seeking to minimize a (smooth) scalar objective function F (u) = F (u1 , . . . , un ). According
to Theorem 18.39, at any given point u in the domain of definition of F, the negative
gradient vector −∇F(u), if nonzero, points in the direction of the steepest decrease in
F. Thus, to minimize F, an evident strategy is to walk downhill, and, to be efficient,
walk downhill as fast as possible, namely in the direction −∇F(u). After walking in this
direction for a little while, we recompute the gradient, and this tells us the new direction
to head downhill. With luck, we will eventually end up at the bottom of the valley, i.e.,
at a (local) minimum value of the objective function.
This simple idea forms the basis of the gradient descent method for minimizing the
objective function F (u). In a numerical implementation, we start the iterative procedure
with an initial guess u^(0), and let u^(k) denote the kth approximation to the minimum
u*. To compute the next approximation, we move away from u^(k) in the direction of the
negative gradient, and hence
\[ u^{(k+1)} = u^{(k)} - t_k\, \nabla F(u^{(k)}) \tag{18.73} \]
for some positive scalar tₖ > 0 that indicates how far we move in the negative gradient
direction. We are free to adjust tₖ so as to optimize our descent path, and this is the key
to the success of the method.
If ∇F(u^(k)) ≠ 0, then, at least when tₖ > 0 is sufficiently small,
\[ F(u^{(k+1)}) < F(u^{(k)}), \tag{18.74} \]
and so u^(k+1) is, presumably, a better approximation to the desired minimum. Clearly, we
cannot choose tₖ too large or we run the risk of overshooting the minimum and reversing
the inequality (18.74). Think of walking downhill in the Swiss Alps. If you walk too far
in a straight line, which is what happens as tₖ increases, then you might very well miss
the valley and end up higher than you began, which is not a good strategy for descending to the
bottom! On the other hand, if we choose tₖ too small, taking very tiny steps, then the
method will converge to the minimum much too slowly.
How should we choose an optimal value for the factor tₖ? Keep in mind that the goal
is to minimize F(u). Thus, a good strategy would be to set tₖ equal to the value of t > 0
that minimizes the scalar objective function
\[ g(t) = F\bigl( u^{(k)} - t\, \nabla F(u^{(k)}) \bigr) \tag{18.75} \]
obtained by restricting F(u) to the ray emanating from u^(k) that lies in the negative
gradient direction. Physically, this corresponds to setting off in a straight line in the
direction of steepest decrease in our altitude, and continuing on in this direction until we
cannot go down any further. Barring luck, we will not have reached the actual bottom of
the valley, but must then readjust our direction and continue on down the hill in a series
of straight line paths, each connecting u^(k) to u^(k+1).
In practice, one can rarely compute the minimizing value t* of g(t) exactly. Instead,
we use one of the scalar minimization algorithms presented in the previous subsection.

Note that we only need to look for a minimum among positive values of t > 0, since our
choice of the negative gradient direction assures us that, at least for t sufficiently small
and positive, g(t) < g(0).
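Combining the pieces, the following illustrative sketch (my own, not the text's algorithm) performs gradient descent with the step size tₖ chosen by a bounded scalar line search over (18.75); SciPy's one-dimensional minimizer is used for the line search, and the bracket length t_max and the tolerances are arbitrary assumptions.

```python
import numpy as np
from scipy.optimize import minimize_scalar

def gradient_descent(F, gradF, u0, t_max=1.0, tol=1e-8, max_iter=500):
    """Minimize F by steepest descent (18.73), choosing t_k by a scalar line search (18.75)."""
    u = np.asarray(u0, dtype=float)
    for _ in range(max_iter):
        g = gradF(u)
        if np.linalg.norm(g) < tol:                        # approximate critical point
            break
        line = lambda t: F(u - t * g)                      # g(t) of (18.75)
        t_k = minimize_scalar(line, bounds=(0.0, t_max), method="bounded").x
        u = u - t_k * g                                    # descent step (18.73)
    return u

# Example: the quadratic of Example 18.46, F(u, v) = u^2 - 2 u v + 3 v^2.
F = lambda u: u[0]**2 - 2*u[0]*u[1] + 3*u[1]**2
gradF = lambda u: np.array([2*u[0] - 2*u[1], -2*u[0] + 6*u[1]])
print(gradient_descent(F, gradF, u0=[1.0, 1.0]))           # tends to the minimizer (0, 0)
```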
Example 18.49.
Conjugate Gradients
The one complication with the basic gradient descent method is that it may take a
long time to reach the minimum. This is a danger if the scalar factors t k are small, and we
end up taking very tiny steps in each round of the iteration. This occurs if we are looking
for a minimum in a long narrow valley, as illustrated in Figure valley . The initial step
takes us into the valley, but then we spend a long time meandering back and forth along
the valley floor before we come close to the true minimum.
One method to avoid such difficulties, and speed up the convergence rate of the scheme,
is to use the method of conjugate directions, modeled on the quadratic minimization procedure discussed in Section 15.2.


Chapter 19
Nonlinear Ordinary Differential Equations

Most physical processes are modeled by differential equations. First order ordinary
differential equations, also known as dynamical systems, arise in a wide range of applications, including population dynamics, mechanical systems, planetary motion, ecology,
chemical diffusion, etc., etc. See [15, 59, ODES] for additional material and applications.
To analyze the dynamics of discrete systems, we pass from algebraic equations to initial
value problems for nonlinear ordinary differential equations. We begin with some elementary solution methods, and the basic existence theorems. In physical systems, we observe
the stable equilibria. We demonstrate how stability follows, in most cases, by linearization
of the system around the equilibrium solution.
Our overriding emphasis will be on those properties of solutions that have physical
relevance. Finding a solution to a differential equation is not so important if that solution never appears in the physical model represented by the system, or is only realized
in exceptional circumstances. Thus, equilibrium solutions, which correspond to configurations in which the physical system does not move, only have physical relevance if they
are stable. An unstable equilibrium will not appear in practice, since slight perturbations
in the system or its physical surroundings will immediately nudge the system away from
equilibrium, and the instability will mean that it moves far away.
The goal of this chapter is to study and solve initial value problems for nonlinear
systems of ordinary differential equations. Of course, very few nonlinear systems can be
solved explicitly, and so one must typically rely on a suitable numerical scheme in order
to approximate the solution. However, numerical schemes do not always give accurate results. Without some basic theoretical understanding of the nature of solutions, equilibrium
points, and their stability, one would not be able to understand when numerical solutions
(even those provided by standard well-used packages) are to be trusted. Moreover, when
testing a numerical scheme, it helps to have already assembled a repertoire of nonlinear
problems in which one already knows one or more explicit analytic solutions. Further tests
and theoretical results can be based on first integrals (also known as conservation laws) or,
more generally, Lyapunov functions. Although we only have space to touch on these topics
briefly, we hope this will whet the reader's appetite for delving into this subject in
more depth. The references [15, Diacu, 59, 65, 69] can be profitably consulted.
Finally, we present a few of the most basic numerical solution techniques for ordinary
differential equations. We begin with Euler, and work up to the Runge-Kutta fourth order
method, which is one of the most popular methods for everyday applications.

19.1. First Order Systems of Ordinary Differential Equations.


In this section, we introduce the basic object of study: initial value problems for first
order systems of ordinary differential equations. While physical systems are often modeled by higher
order equations and systems, there is an easy trick that converts all higher order systems
into equivalent first order systems. Thus, we do not lose any generality by focusing our
attention on the first order case. Moreover, almost all numerical solution algorithms are
designed for first order systems, and so to numerically integrate a higher order system, one
also must place it into equivalent first order form.
Scalar Ordinary Differential Equations
As always, to study a new problem, it is essential to begin with the simplest case.
Consider the scalar, first order ordinary differential equation
\[ \frac{du}{dt} = F(t, u). \tag{19.1} \]

The unknown function u(t) represents some dynamical physical quantity that depends
upon the scalar variable t, which usually represents time. Under appropriate conditions on
the right hand side, the solution u(t) is continuously differentiable and uniquely specified
by its value at a single time,
\[ u(t_0) = u_0. \tag{19.2} \]
The combination (19.1), (19.2) is referred to as an initial value problem, and our goal is
to devise both analytical and numerical solution methods.
The simplest class are the autonomous differential equations, which means that the
right hand side does not explicitly depend upon the time:
\[ \frac{du}{dt} = F(u). \tag{19.3} \]
Autonomous scalar equations can be solved by integration. We first divide both sides by
F(u), whereby
\[ \frac{1}{F(u)}\, \frac{du}{dt} = 1. \]
We then integrate with respect to t, and evaluate the left hand integral by a change of
variables, replacing t by u, with du = (du/dt) dt; the result is of the form
\[ G(u) = \int \frac{du}{F(u)} = \int \frac{1}{F(u)}\,\frac{du}{dt}\; dt = \int dt = t + k, \tag{19.4} \]
where k is a constant of integration, while G(u) represents any convenient antiderivative
of the function 1/F(u). Technically, a second constant of integration should appear on the
left hand side, but this can be immediately absorbed into the constant k. Equation (19.4)
defines u implicitly as a function of t. We note that this method can be formally viewed
as a separation of variables,
\[ \frac{du}{F(u)} = dt, \tag{19.5} \]

in which all terms involving u, including its differential du, are collected on the left hand
side of the equation, while all terms involving t and its differential are placed on the right.
The implicit solution (19.4) is then obtained by integrating both sides of the separated
equation (19.5).
If we can solve the implicit equation for u, we obtain the explicit solution
\[ u(t) = H(t + k), \tag{19.6} \]
in which H = G⁻¹ is the inverse function. Finally, to satisfy the initial condition
(19.2), we use the implicit form, whereby G(u₀) = t₀ + k. Therefore,
\[ G(u) - G(u_0) = t - t_0, \qquad \text{and hence} \qquad u(t) = H\bigl( t - t_0 + G(u_0) \bigr). \tag{19.7} \]
Example 19.1. Consider the autonomous initial value problem
\[ \frac{du}{dt} = u^2, \qquad u(t_0) = u_0. \]
To solve the differential equation, we rewrite it in the separated form
\[ \frac{du}{u^2} = dt, \qquad \text{and then integrate both sides:} \qquad \int \frac{du}{u^2} = -\,\frac{1}{u} = t + k. \]
Solving for u, we deduce the general solution formula
\[ u = -\,\frac{1}{t + k}\,. \]
To specify k, we evaluate u at the initial time t₀; this implies
\[ u_0 = -\,\frac{1}{t_0 + k}\,, \qquad \text{so that} \qquad k = -\,\frac{1}{u_0} - t_0. \]
Therefore, the solution to the initial value problem is
\[ u = \frac{u_0}{1 - u_0\,(t - t_0)}\,. \tag{19.8} \]
Figure utu2 shows the graphs of some typical solutions.


Observe that as t approaches the critical value t* = t₀ + 1/u₀, the solution blows up:
u(t) → ∞. The blow-up time depends upon the initial data: the larger u₀ is, the sooner
the solution goes off to infinity. If the initial data is negative, the solution is well-defined
for all t > t₀, but there is a singularity in the past: t* < t₀. The only solution that exists
for all positive and negative time is the constant solution u(t) ≡ 0, corresponding to the
initial condition u₀ = 0.
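As a quick check (my own illustration, not part of the text), the initial value problem with t₀ = 0, u₀ = 1 can be verified symbolically, and the blow-up at t* = t₀ + 1/u₀ = 1 observed, assuming SymPy is available:

```python
import sympy as sp

t = sp.symbols("t")
u = sp.Function("u")

# Initial value problem u' = u^2 with u(0) = 1, cf. Example 19.1 (t0 = 0, u0 = 1).
sol = sp.dsolve(sp.Eq(u(t).diff(t), u(t)**2), u(t), ics={u(0): 1})
print(sol)                                 # u(t) = -1/(t - 1), i.e. 1/(1 - t), matching (19.8)

# The solution blows up as t approaches t* = 1 from below.
print(sp.limit(sol.rhs, t, 1, dir="-"))    # oo
```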
The constant or equilibrium solutions to an autonomous ordinary differential equation
play a particularly important role. If u(t) ≡ u* is a constant solution, then du/dt ≡ 0, and
hence the differential equation implies that F(u*) = 0. Therefore, the equilibrium solutions
to (19.3) coincide with the roots of the function F(u). In point of fact, the derivation of our
formula for the solution (19.4) assumed that we were not at an equilibrium point where

F(u) = 0. In the preceding example, the final solution formula (19.8), obtained from the implicit
equation, happened to include the equilibrium solution u(t) ≡ 0, but this was a lucky
accident, and one must typically take more care that such solutions do not elude us when
applying the integration method.
Example 19.2. Although a population of people, animals, or bacteria consists of
individuals, the aggregate behavior can often be effectively modeled by a continuous dynamical system. According to the English economist Thomas Malthus, the population of
a species grows, roughly, in proportion to its size. Thus, the number of individuals v(t) in
a species at time t satisfies a first order differential equation of the form
\[ \frac{dv}{dt} = \lambda\, v, \tag{19.9} \]
where the proportionality factor λ measures the rate of growth, namely the difference
between the birth rate and the death rate. Thus, if births exceed deaths, λ > 0, and the
population increases, whereas if λ < 0, more individuals are dying and the population
shrinks.
In the very simplest model, the growth rate λ is assumed to be independent of the
population size, and we have the simple linear ordinary differential equation (8.3) that we
solved at the beginning of Chapter 8. The solutions satisfy the exponential or Malthusian
growth law v(t) = v₀ e^{λt}, where v₀ = v(0) is the initial population size. Thus, if λ > 0 the
population grows without limit, while if λ < 0 the population dies out, v(t) → 0, at an
exponentially fast rate. This model is reasonably accurate for an isolated population in an
environment with unlimited resources.
In a more realistic population model, the growth rate can depend upon the size of
the population as well as external environmental factors. For example, in an environment
with limited resources, if the size of the population exceeds the capacity of the resources,
then the growth rate will be negative. Thus, λ(v) > 0 if v < n, while λ(v) < 0 for v > n,
for some number n > 0 that depends upon the resource availability. The simplest function
of this form is λ(v) = μ(n − v), where μ > 0 is a positive constant. This leads to the
nonlinear population model
\[ \frac{dv}{dt} = \mu\, v\, (n - v). \tag{19.10} \]
In deriving this model, we assumed that the environment is not changing over time; a
dynamical environment would lead to a more complicated non-autonomous differential
equation.
Before analyzing the solutions to the nonlinear population model, let us make a change
of variables, and set v(t) = n u(t), so that u represents the size of the population in
proportion to the carrying capacity n of the environment. Then u(t) satisfies the logistic
differential equation
\[ \frac{du}{dt} = \lambda\, u\, (1 - u), \qquad \text{where} \qquad \lambda = \mu\, n. \tag{19.11} \]
This differential equation is the continuous counterpart of the logistic map (18.19). However, unlike its discrete cousin, the logistic differential equation is quite sedate, and its

solutions are easily understood. First, there are two equilibrium solutions: u(t) ≡ 0 and
u(t) ≡ 1, obtained by setting the right hand side of the equation equal to zero. The first
represents a nonexistent population with no individuals and hence no reproduction. The
second equilibrium solution corresponds to a population v(t) ≡ n that is at the ideal size
for the environment, and so deaths exactly balance births. In all other situations, the
population size will vary over time.
To integrate the logistic differential equation, we proceed as above, first writing it in
the separated form
\[ \frac{du}{u(1-u)} = \lambda\, dt. \]
Integrating both sides, and using partial fractions,
\[ \int \frac{du}{u(1-u)} = \int \frac{du}{u} + \int \frac{du}{1-u} = \log \frac{u}{1-u} = \lambda\, t + k, \]
where k is a constant of integration. Therefore
\[ \frac{u}{1-u} = c\, e^{\lambda t}, \qquad \text{where} \qquad c = e^{k}. \]
Solving for u, we deduce the solution
\[ u(t) = \frac{c\, e^{\lambda t}}{1 + c\, e^{\lambda t}}\,. \tag{19.12} \]

The constant of integration is fixed by the initial condition u(0) = u₀. Solving
\[ u(0) = \frac{c}{1 + c} = u_0 \qquad \text{gives} \qquad c = \frac{u_0}{1 - u_0}\,. \]
Substituting the result back into the solution formula (19.12) and simplifying, we find
\[ u(t) = \frac{u_0\, e^{\lambda t}}{1 - u_0 + u_0\, e^{\lambda t}}\,. \tag{19.13} \]

The solutions are illustrated in Figure logde . Interestingly, the equilibrium solutions
are not covered by the integration method, but do appear in the final solution formula,
corresponding to initial data u0 = 0 and u0 = 1 respectively.
When using the logistic equation to model population dynamics, the initial data is
assumed to be positive, u₀ > 0. As time t → ∞, the solution (19.13) tends to the
equilibrium value u(t) → 1. For small initial values u₀ ≪ 1, the solution initially grows at
an exponential rate λ, corresponding to a population with unlimited resources. However,
as the population increases, the gradual lack of resources tends to slow down the growth
rate, and eventually the population saturates at the equilibrium value. On the other hand,
if u₀ > 1, the population is too large to be sustained by the resources, and so dies off until
it reaches the same saturation value. If u₀ = 0, then the solution remains at equilibrium
u(t) ≡ 0. Finally, when u₀ < 0, the solution only exists for a finite amount of time, with
u(t) → −∞ as t → t* = (1/λ) log((u₀ − 1)/u₀). Of course, this final case does not correspond
to a physical situation, since we cannot have a negative population!
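For illustration (a sketch of mine with arbitrarily chosen λ and u₀, not from the text), one can integrate the logistic equation numerically and compare with the closed-form solution (19.13):

```python
import numpy as np
from scipy.integrate import solve_ivp

lam = 1.0                                    # growth rate, assumed value
u0 = 0.1                                     # initial proportion of carrying capacity

def logistic_rhs(t, u):
    return lam * u * (1 - u)                 # right hand side of (19.11)

t_eval = np.linspace(0, 10, 101)
numerical = solve_ivp(logistic_rhs, (0, 10), [u0], t_eval=t_eval, rtol=1e-8)

exact = u0 * np.exp(lam * t_eval) / (1 - u0 + u0 * np.exp(lam * t_eval))   # (19.13)
print(np.max(np.abs(numerical.y[0] - exact)))   # small discrepancy; both saturate at 1
```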

A separable ordinary differential equation has the form
\[ \frac{du}{dt} = a(t)\, F(u), \tag{19.14} \]
in which the right hand side is the product of a function of t only times a function of
u. Although a separable equation is not autonomous (unless a(t) is constant), as the name
suggests it can be solved by the same separation of variables method. We rewrite the
equation in the separated form
\[ \frac{du}{F(u)} = a(t)\, dt. \]
Integrating both sides of this equation leads to the solution in implicit form
\[ G(u) = \int \frac{du}{F(u)} = \int a(t)\, dt = A(t) + k. \tag{19.15} \]
Solving the implicit equation yields the explicit solution
\[ u(t) = H\bigl( A(t) + k \bigr), \qquad \text{where} \qquad H = G^{-1}. \tag{19.16} \]
The integration constant k is then fixed by the initial condition.


Example 19.3. Let us solve the initial value problem
\[ \frac{du}{dt} = (1 - 2t)\, u, \qquad u(0) = 1. \tag{19.17} \]
We begin by writing the differential equation in separated form
\[ \frac{du}{u} = (1 - 2t)\, dt. \]
Integrating both sides leads to
\[ \log u = \int \frac{du}{u} = \int (1 - 2t)\, dt = t - t^2 + k. \]
We can readily solve for
\[ u = c\, \exp(t - t^2), \]
where c = e^k. This is the general solution to the differential equation. The initial
condition requires that c = 1, and hence u(t) = e^{t − t²} is the unique solution to the initial
value problem. The solution is graphed in Figure ox2 .
First Order Systems
A first order system of ordinary differential equations has the general form
\[ \frac{du_1}{dt} = F_1(t, u_1, \ldots, u_n), \qquad \ldots \qquad \frac{du_n}{dt} = F_n(t, u_1, \ldots, u_n). \tag{19.18} \]

The unknowns u₁(t), …, u_n(t) are functions of the real variable t, which usually represents
time. We shall write the system in vector form
\[ \frac{du}{dt} = F(t, u), \tag{19.19} \]
so that F: Ω → ℝⁿ is a vector-valued function of n + 1 variables defined on an open domain
Ω ⊂ ℝⁿ⁺¹. By a solution to the differential equation, we mean a vector-valued function
u(t) that is defined and continuously differentiable on an interval a < t < b, and, moreover,
satisfies the differential equation on its interval of definition. The solution u(t) serves to
parametrize a curve C ⊂ ℝⁿ, called the solution trajectory or orbit.
A system of differential equations is called autonomous if the right hand side does not
explicitly depend upon the time t, and so has the form
\[ \frac{du}{dt} = F(u). \tag{19.20} \]
One important class of autonomous first order systems are the steady state fluid flows in
two and three dimensions. In this case, F(u) represents the fluid velocity vector field at the
position u. A solution u(t) represents the trajectory of an individual fluid particle. The
differential equation tells us that the fluid velocity at each point of its trajectory matches
the prescribed vector field. Details can be found in Appendices A and B.
In this chapter, we shall be concerned with initial value problems for first order systems
of ordinary differential equations. The general initial conditions are
\[ u_1(t_0) = a_1, \qquad u_2(t_0) = a_2, \qquad \ldots \qquad u_n(t_0) = a_n, \tag{19.21} \]
or, in vectorial form, u(t₀) = a. Here t₀ is a prescribed initial time, while the vector
a = (a₁, a₂, …, a_n)ᵀ prescribes the initial position of the desired solution. In favorable
situations, the initial conditions serve to uniquely specify a solution to the differential
equations.
An equilibrium solution to an autonomous system of ordinary differential equations
is defined to be a constant solution: u(t) ≡ u* for all t. Since the solution is constant,
its derivative must vanish, du/dt ≡ 0. Hence, every equilibrium solution corresponds to a
root or solution to the system of algebraic equations
\[ F(u^\star) = 0 \tag{19.22} \]
prescribed by the right hand sides of the equations in the system.


Example 19.4. A predator-prey system is a simplified ecological model of two
species: the predators which feed on the prey. For example, the predators might be
lions in the Serengeti and the prey zebra. Both species obey a population growth model
of the form (19.9), but now the growth rate depends upon the other species. Letting u(t)
represent the number of prey, and v(t) represents the number of predators at time t, then
the dynamical system takes the general form
\[ \frac{du}{dt} = \lambda\, u, \qquad \frac{dv}{dt} = \mu\, v. \]
The more prey, i.e., the larger u is, the faster the predators reproduce, while a lack of prey
will cause them to die off. On the other hand, the more predators, the faster the prey
are consumed and the slower the net growth rate. If we assume that the environment has
unlimited resources for the prey, which, barring drought, is probably valid in the case of the
zebras, then the simplest model that incorporates these assumptions is the Lotka-Volterra
system
\[ \frac{du}{dt} = \alpha\, u - \beta\, u\, v, \qquad \frac{dv}{dt} = -\,\gamma\, v + \delta\, u\, v, \tag{19.23} \]
corresponding to growth rates λ = α − β v, μ = −γ + δ u. The parameters α, β, γ, δ > 0
are all positive, and their precise values will depend upon the species involved and how
they interact. In particular, α represents the unrestrained growth rate of the prey in the
absence of predators, while γ represents the rate that the predators die off in the absence
of food. The nonlinear terms model the interaction of the two species. The parameters
are determined by field data, along, perhaps, with educated guesses. The initial conditions
u(t₀) = u₀, v(t₀) = v₀ are the initial populations of the two species.
Let us determine the possible equilibria of the predator-prey system (19.23). According to (19.22), finding the equilibria requires setting the right hand sides of the system to
zero, and solving the resulting algebraic system. We have
\[ 0 = \alpha\, u - \beta\, u\, v = u\,(\alpha - \beta\, v), \qquad 0 = -\,\gamma\, v + \delta\, u\, v = v\,(-\,\gamma + \delta\, u). \]
Since α, β, γ, δ are all positive, there are two distinct equilibria, namely
\[ u_1^\star = v_1^\star = 0, \qquad \text{and} \qquad u_2^\star = \frac{\gamma}{\delta}\,, \quad v_2^\star = \frac{\alpha}{\beta}\,. \]
The first is the uninteresting equilibrium where there are no animals, no predators and
no prey. The second is more interesting, and indicates a steady value for both populations,
no prey. The second is more interesting, and indicates a steady value for both populations,
in which the birth rate of the prey is precisely sufficient to continuously feed the predators.
Is this a feasible solution? Or, more mathematically, is this a stable equilibrium? We shall
develop the tools to answer this question below.
Higher Order Systems
Many physical models lead to nonlinear differential equations depending upon higher
order derivatives of the unknowns. There is a standard trick to reduce any higher order ordinary
differential equation or higher order system to a first order system. As a result, it suffices
to analyze first order systems; there is no need to develop a separate theory for higher
order systems and equations.
We have already encountered the basic idea in our discussion of the phase plane
approach to second order scalar equations. Given a second order equation
\[ \frac{d^2 u}{dt^2} = F\!\left( t, u, \frac{du}{dt} \right), \tag{19.24} \]
we define the function v = du/dt. Since dv/dt = d²u/dt², the functions u, v satisfy the
first order system
\[ \frac{du}{dt} = v, \qquad \frac{dv}{dt} = F(t, u, v). \tag{19.25} \]

The initial conditions u(t₀) = u₀, v(t₀) = v₀ for the first order system translate into
a pair of initial conditions u(t₀) = u₀, u̇(t₀) = v₀ specifying the value of the solution
and its first order derivative for the second order equation. It is easy to verify that the
first component of a solution (u(t), v(t)) to the system defines a solution u(t) to the scalar
equation; conversely, a solution u(t) to the scalar equation induces a solution (u(t), v(t)) =
(u(t), u̇(t)) to the system. In this way, the second order equation and the first order system
are entirely equivalent.
Example 19.5. The forced van der Pol equation
\[ \frac{d^2 u}{dt^2} + (u^2 - 1)\, \frac{du}{dt} + u = f(t) \tag{19.26} \]
arises in the modeling of an electrical circuit with a triode whose resistance changes with
the current, [EE], and other areas, including certain chemical reactions and wind-induced
motions of structures. To convert this second order equation into a first order system, we
set v = u̇, whence
\[ \frac{du}{dt} = v, \qquad \frac{dv}{dt} = f(t) - (u^2 - 1)\, v - u. \tag{19.27} \]
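To illustrate how such a first order system is handed to a standard numerical integrator, here is a sketch of mine (not from the text); the forcing function and initial data are arbitrary assumptions.

```python
import numpy as np
from scipy.integrate import solve_ivp

def forcing(t):
    return 0.5 * np.cos(t)                         # assumed forcing f(t)

def van_der_pol(t, y):
    u, v = y                                       # y = (u, v) with v = du/dt
    return [v, forcing(t) - (u**2 - 1) * v - u]    # the first order system (19.27)

solution = solve_ivp(van_der_pol, (0.0, 40.0), [1.0, 0.0], max_step=0.05)
print(solution.y[0][-5:])                          # last few computed values of u(t)
```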

Similarly, given a third order equation
\[ \frac{d^3 u}{dt^3} = F\!\left( t, u, \frac{du}{dt}, \frac{d^2 u}{dt^2} \right), \]
we set
\[ v = \frac{du}{dt}\,, \qquad w = \frac{dv}{dt} = \frac{d^2 u}{dt^2}\,. \]
The new variables satisfy the equivalent first order system
\[ \frac{du}{dt} = v, \qquad \frac{dv}{dt} = w, \qquad \frac{dw}{dt} = F(t, u, v, w). \]

The general construction should now be clear.


Example 19.6. The Newtonian equations for a mass m moving in a potential force
field are a second order system of the form m d²u/dt² = −∇F(u), in which u(t) represents
the position of the mass and F(u) the potential function. In components,
\[ m\,\frac{d^2 u}{dt^2} = -\,\frac{\partial F}{\partial u}\,, \qquad m\,\frac{d^2 v}{dt^2} = -\,\frac{\partial F}{\partial v}\,, \qquad m\,\frac{d^2 w}{dt^2} = -\,\frac{\partial F}{\partial w}\,. \tag{19.28} \]
For example, a planet moving in the sun's gravitational field satisfies the system with the
gravitational potential
\[ F(u) = -\,\frac{\alpha}{\| u \|} = -\,\frac{\alpha}{\sqrt{u^2 + v^2 + w^2}}\,, \qquad \nabla F(u) = \frac{\alpha\, u}{\| u \|^3} = \frac{\alpha}{(u^2 + v^2 + w^2)^{3/2}} \begin{pmatrix} u \\ v \\ w \end{pmatrix}. \tag{19.29} \]
To convert the second order Newtonian equations into a first order system, we let v = u̇
be the velocity vector, with components p = du/dt, q = dv/dt, r = dw/dt, and so
\[ \frac{du}{dt} = p, \qquad \frac{dv}{dt} = q, \qquad \frac{dw}{dt} = r, \]
\[ \frac{dp}{dt} = -\,\frac{1}{m}\,\frac{\partial F}{\partial u}(u, v, w), \qquad \frac{dq}{dt} = -\,\frac{1}{m}\,\frac{\partial F}{\partial v}(u, v, w), \qquad \frac{dr}{dt} = -\,\frac{1}{m}\,\frac{\partial F}{\partial w}(u, v, w). \tag{19.30} \]
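Likewise, once the gravitational equations are in the first order form (19.30) they can be fed to a numerical integrator; in this illustrative sketch of mine the mass, the constant α, and the initial data are arbitrary, non-physical choices.

```python
import numpy as np
from scipy.integrate import solve_ivp

m, alpha = 1.0, 1.0                               # assumed mass and potential constant

def newtonian(t, y):
    u, v, w, p, q, r = y                          # positions and velocities, cf. (19.30)
    grad = alpha * np.array([u, v, w]) / (u*u + v*v + w*w)**1.5   # grad F, cf. (19.29)
    return [p, q, r, *(-grad / m)]

# Initial position on the x-axis with a transverse velocity: a bound orbit.
y0 = [1.0, 0.0, 0.0, 0.0, 0.8, 0.0]
orbit = solve_ivp(newtonian, (0.0, 20.0), y0, rtol=1e-9, atol=1e-9)
print(orbit.y[:3, -1])                            # final position of the "planet"
```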

Example 19.7. There is a simple trick for changing any non-autonomous system
into an autonomous system involving one additional variable. Namely, one introduces an
extra coordinate u₀ = t to represent the time. The time variable satisfies the elementary
differential equation du₀/dt = 1 with initial condition u₀(t₀) = t₀. Thus, the original system
(19.18) can be written in an equivalent autonomous form
\[ \frac{du_0}{dt} = 1, \qquad \frac{du_1}{dt} = F_1(u_0, u_1, \ldots, u_n), \qquad \ldots \qquad \frac{du_n}{dt} = F_n(u_0, u_1, \ldots, u_n). \tag{19.31} \]
For example, an autonomous form of the forced van der Pol system is
\[ \frac{du_0}{dt} = 1, \qquad \frac{du_1}{dt} = u_2, \qquad \frac{du_2}{dt} = f(u_0) - (u_1^2 - 1)\, u_2 - u_1. \tag{19.32} \]

19.2. Existence, Uniqueness, and Continuous Dependence.


There is no general analytical method that will solve all differential equations. Indeed,
even relatively simple first order, scalar non-autonomous ordinary differential equations
cannot be solved in closed form. One example is the particular Riccati equation
\[ \frac{du}{dt} = u^2 + t, \tag{19.33} \]
whose solution cannot be written in terms of elementary functions, although there is a
solution formula that relies on Airy functions, cf. Exercise . The Abel equation
\[ \frac{du}{dt} = u^3 + t \tag{19.34} \]

fares even worse, since its general solution cannot be written in terms of known special
functions. Understanding when a given differential equation can be solved in terms of
elementary functions or known special functions is an active area of contemporary research.
In this context, we cannot resist mentioning that the most important class of exact
solution techniques for differential equations are those based on symmetry methods. An
introduction can be found in the first author's graduate level monograph [97]; see also
[Cantwell, 68].
Existence
Before worrying about how to solve a differential equation, either analytically, qualitatively, or numerically, it makes sense to investigate the underlying issues of existence

and uniqueness. First, does a solution exist? If not, it makes no sense trying to find
one. Second, is the solution uniquely determined by the initial conditions? Otherwise, the
differential equation does not have much relevance in physical applications since we cannot
use it as a predictive tool.
Unlike partial differential equations, which must be treated on a case-by-case basis,
there are satisfactory general results that answer the existence and uniqueness questions
for almost all initial value problems for systems of ordinary differential equations. We will
not take time to discuss the proofs of these fundamental results, which can be found in
most advanced textbooks on ordinary differential equations, including [15, 59, 65, 69].
Let us begin by stating the fundamental existence theorem.
Theorem 19.8. Let F(t, u) be a continuous function. Then the initial value problem
\[ \frac{du}{dt} = F(t, u), \qquad u(t_0) = a, \tag{19.35} \]
has a solution u = f(t) defined for nearby times | t − t₀ | < δ, for some δ > 0.

The existence theorem guarantees that the solution exists at least for times sufficiently close to the initial instant t₀. As we saw, this may be the most that can be said,
although in many systems the maximal interval α < t < β of existence of the solution
might be much larger, even infinite: −∞ < t < ∞. The interval of existence typically
depends upon both the equation and the particular initial data. For instance, in the elementary Example 19.1, the solutions to the initial value problem only exist up until time
1/u0 , and so the larger the initial data, the shorter the time of existence. It is worth noting
that this phenomenon did not appear in the linear regime, where, barring singularities in
the equation, solutions to a linear ordinary differential equation are guaranteed to exist
for all time.
In practice, we will always extend solutions to the maximal interval of their existence.
If there is a point beyond which the solution cannot be extended, then either the solution
‖u(t)‖ becomes unbounded in a finite time, or, if the right hand side F(t, u) is only
defined on a subset Ω ⊂ ℝⁿ⁺¹, then the solution reaches the boundary ∂Ω in a finite time.
A proof of this fact can be found in the above-mentioned references.
Remark : The existence theorem can be readily adapted to apply to higher order
systems of ordinary differential equations through our trick for converting a higher order
system into a first order system by introducing additional variables. The appropriate initial
conditions are induced from those of the equivalent first order system, as in the second
order example (19.24) discussed above.
Uniqueness
As important as existence is the question of uniqueness. Does the initial value problem have more than one solution? While continuity of the function F(t, u) is enough to
If F(t, u) is only defined on a domain Ω ⊂ ℝⁿ⁺¹, then we must assume that the initial
conditions (t₀, a) belong to the domain of definition.


guarantee that a solution exists, it is not quite enough to ensure uniqueness of the solution
to the initial value problem. The difficulty can be appreciated by looking at an elementary
example.
Example 19.9. Consider the nonlinear initial value problem
\[ \frac{du}{dt} = \tfrac{3}{2}\, \sqrt[3]{u}\,, \qquad u(0) = 0. \tag{19.36} \]
Since the right hand side F(u) = (3/2) ∛u is continuous, Theorem 19.8 assures us of the
existence of a solution. This autonomous scalar equation can be easily solved by separation
of variables:
\[ \int \frac{2\, du}{3\, \sqrt[3]{u}} = u^{2/3} = t + c, \qquad \text{and so} \qquad u = (t + c)^{3/2}. \]
Substituting into the initial condition implies that c = 0, and hence
\[ u(t) = t^{3/2} \]
is a solution to the initial value problem, at least provided t ≥ 0, since otherwise our
solution is not a real-valued function.
On the other hand, since the right hand side vanishes at u = 0, the constant function
\[ u(t) \equiv 0 \]
is an equilibrium solution to the differential equation. (Note that in this case the separation
of variables solution formula did not recover the equilibrium solution.) Moreover, the
equilibrium solution also has the initial value u(0) = 0. Therefore, we have constructed
two different solutions to the initial value problem (19.36). Uniqueness is not valid! Worse
yet, there are in fact an infinite number of solutions to the initial value problem. Given
any a > 0, the function
\[ u(t) = \begin{cases} 0, & 0 \leq t \leq a, \\ (t - a)^{3/2}, & t \geq a, \end{cases} \tag{19.37} \]
is differentiable everywhere, even at t = a. Moreover, it satisfies both the differential
equation and the initial condition. Several of the solutions are plotted in Figure nonu .

In conclusion, to ensure uniqueness of solutions, we need to impose a stronger restriction than mere continuity on the differential equation. The proof of the following basic
uniqueness theorem can be found in the above references.
Theorem 19.10. If F(t, u) ∈ C¹ is continuously differentiable, then there exists one
and only one solution to the initial value problem (19.35).
Thus, the difficulty with the differential equation (19.36) is that the function F(u) =
(3/2) ∛u, although continuous everywhere, is not differentiable at u = 0, and hence the
uniqueness theorem does not apply. On the other hand, F(u) is continuously differentiable
away from u = 0, and so any nonzero initial condition u(t₀) = u₀ ≠ 0 will produce a
unique solution for as long as it remains away from the problematic value u = 0.


Remark: While having continuous partial derivatives is sufficient to guarantee uniqueness, this condition can, in fact, be slightly weakened. It suffices to require that F(t, u) is
continuous as a function of t and satisfies the Lipschitz condition
\[ \| F(t, v) - F(t, u) \| \leq C(t)\, \| v - u \| \tag{19.38} \]
for all t, u, v, and some positive C(t) > 0. See (18.18) above and the subsequent discussion
for more details on Lipschitz continuity.
Blanket Hypothesis: From now on, all differential equations must satisfy the uniqueness criterion that their right hand side is continuously differentiable, or, at least, satisfies
the Lipschitz inequality (19.38).
One important consequence of the uniqueness theorem is that a solution u(t) to an
autonomous system of ordinary differential equations is either in equilibrium, not varying
in time, so u̇ ≡ 0, or is moving at all times where defined, i.e., u̇ ≠ 0 everywhere. In other
words, it is mathematically impossible for a solution to reach an equilibrium position in
a finite amount of time, although it may well approach equilibrium in an asymptotic
fashion as t → ∞. Physically, once the solution gets sufficiently close to equilibrium, we
do not observe its motion; mathematically there is always a slight motion as the solution
gets closer and closer, never quite reaching the equilibrium until the end of time.
Proposition 19.11. If u(t) is any solution to an autonomous ordinary differential
equation such that u(t*) = u* at some time t*, then u(t) ≡ u* is the equilibrium solution.
Proof: We regard u(t*) = u* as initial data for the given solution u(t) at the initial
time t*. Since F(u*) = 0, the constant function u*(t) ≡ u* is a solution of the differential
equation that satisfies the same initial conditions. Therefore, by uniqueness, the solution
in question has to agree with it.
Q.E.D.
The system in question is subject to our blanket uniqueness hypothesis. Otherwise,
the result is false. For example, the function
\[ u(t) = \begin{cases} (t^\star - t)^{3/2}, & t \leq t^\star, \\ 0, & t \geq t^\star, \end{cases} \]
is a solution to the scalar ordinary differential equation (19.36) that reaches equilibrium,
u* = 0, in a finite time.
Although a solution cannot reach equilibrium in a finite time, it can certainly have
a well-defined limiting value. It can be proved that such a limit point is necessarily an
equilibrium solution. Details of the method of proof can be found in the above-mentioned
references.
Proposition 19.12. If u(t) is any solution to an autonomous ordinary differential
equation such that lim_{t→∞} u(t) = u*, then u* is an equilibrium solution to the system.


The same conclusion holds if we run time backwards: if lim_{t→−∞} u(t) = u*, then
u* is also an equilibrium point. Of course, having a limit point is but one of a variety of
limiting behaviors of solutions to ordinary differential equations. Solutions can also become
unbounded, can approach periodic orbits, or be completely chaotic, depending upon the
nature of the system and the initial conditions.
Continuous Dependence
In physical applications, it is rare, if not infeasible, to be able to prescribe the initial
conditions exactly. Rather, experimental and physical errors will only allow us to say that
the initial conditions are approximately equal to those in our mathematical model. Thus,
we need to be sure that a small error in our initial measurements does not produce a large
effect in the solution. A similar argument can be made for any physical parameters, e.g.,
masses, charges, frictional coefficients, etc., that appear in the differential equation itself.
If we change them slightly, this should not have a dramatic effect on the solution.
Mathematically, what we are after is a criterion of continuous dependence of solutions
upon both initial data and parameters. Fortunately, the desired result holds without
any additional assumptions on the differential equation, other than requiring that the
parameters appear continuously. We state both results in a single theorem.
Theorem 19.13. Consider an initial value problem
\[ \frac{du}{dt} = F(t, u, \mu), \qquad u(t_0) = a(\mu), \tag{19.39} \]
in which the differential equation and/or the initial conditions depend continuously upon
one or more parameters μ = (μ₁, …, μ_k). Then the unique solution u(t, μ) depends
continuously upon the parameters.
Example 19.14. Let us return to the initial value problem
\[ \frac{du}{dt} = \lambda\, u^2, \qquad u(0) = u_0 + \varepsilon, \]
that we considered in Example 19.1. We regard ε as a small perturbation of our original
initial data u₀, and λ as a variable parameter in the equation. The solution is
\[ u(t, \varepsilon) = \frac{u_0 + \varepsilon}{1 - \lambda\,(u_0 + \varepsilon)\, t}\,. \]
Note that, where defined, this is a continuous function of both parameters ε, λ. Thus, a
small change in the initial data, or in the equation, produces a small change in the solution,
at least for times near the initial time.
Continuous dependence does not preclude nearby solutions from eventually becoming
far apart. Indeed, the blow-up time t* = 1/(λ (u₀ + ε)) for a solution depends upon both
the initial data and the parameter in the equation. Thus, as we approach blow up, solutions
that started out very close to each other will get arbitrarily far apart; see Figure cd .

We continue to impose our blanket uniqueness hypothesis.


In light of this example, the continuous dependence of solutions upon parameters


does not prevent solutions to the ordinary differential equation from being chaotic and/or
having sensitive dependence on initial conditions. A very tiny change in the initial
conditions has a negligible initial effect upon the solution, but over longer time intervals
the differences between the two solutions can be dramatic. Further development of these
ideas can be found in [7, 36] and elsewhere.

19.3. Stability.
The only steady state solutions that one directly observes in a physical system are
the stable equilibria. Unstable equilibria are unsustainable in any realistic situation, and
will disappear when subjected to even the tiniest perturbation, e.g., outside traffic jarring the experimental apparatus. Thus, finding the equilibrium solutions to a system of
ordinary differential equations is only half the battle; one must then understand their
stability properties in order to characterize those that can be realized in normal physical
circumstances.
We shall exclusively work with autonomous systems
\[ \frac{du}{dt} = F(u) \tag{19.40} \]
in our presentation. We assume throughout that F(u) is continuously differentiable, so as
to ensure the uniqueness of solutions to the initial value problem.
As we noted in Proposition 19.12, if a solution tends to a single point as t → ∞,
then that point must be an equilibrium solution. If every solution that starts out near a
given equilibrium solution tends to it, the equilibrium is called asymptotically stable. If the
solutions that start out nearby stay nearby, then the critical point is stable. More formal
definitions follow.
Definition 19.15. An equilibrium solution u* to an autonomous system of first
order ordinary differential equations is called
(i) stable if for every ε > 0 there exists a δ > 0 such that if ‖u₀ − u*‖ < δ, then the
solution u(t) with initial conditions u(0) = u₀ satisfies ‖u(t) − u*‖ < ε for all
t ≥ t₀;
(ii) asymptotically stable if u* is stable and, in addition, there exists δ₀ > 0 such that if
‖u₀ − u*‖ < δ₀, then u(t) → u* as t → ∞.
In Chapter 8, we investigated the stability of equilibrium solutions to constant coefficient (i.e., autonomous) linear systems of ordinary differential equations. As we shall
see, except in borderline situations, the same stability criteria carry over to equilibrium
solutions of nonlinear ordinary differential equations. In essence, we approximate the nonlinear system near an equilibrium point by its linearization, just as in the discrete case
discussed in Section 18.1.
Example 19.16. As we saw, the logistic differential equation (19.11) has two equilibrium solutions, corresponding to the two solutions to the equation u(1 − u) = 0. The
first equilibrium solution u₁* = 0 is unstable, since all nearby solutions go away from it at

an exponentially fast rate. On the other hand, the other equilibrium solution u₂* = 1 is
asymptotically stable, since any solution with initial condition u₀ > 0 tends to it at an
exponentially fast rate. See the solution graphs in Figure utu2 for details.
Example 19.17. Consider an autonomous (meaning constant coefficient) homogeneous linear planar system
\[ \frac{du}{dt} = a\,u + b\,v, \qquad \frac{dv}{dt} = c\,u + d\,v. \]
Let
\[ A = \begin{pmatrix} a & b \\ c & d \end{pmatrix} \]
be the coefficient matrix. The origin u* = v* = 0 is an evident equilibrium solution,
and, moreover, is the only equilibrium provided ker A = {0}. In Section 8.7,
we extensively analyzed the stability of the origin equilibrium for all possibilities. The answer depended upon the eigenvalues of the coefficient matrix A. The origin is (globally)
asymptotically stable if and only if both eigenvalues have negative real part. The origin is
stable, but not asymptotically stable, if and only if both eigenvalues are purely imaginary,
or if A = O and has two zero eigenvalues. In all other cases, the origin is unstable. Below
we will see how this simple linear analysis has direct bearing on the stability problem for
nonlinear planar systems.
Stability of Scalar Differential Equations
Before looking at any further examples, we need to develop some basic mathematical
tools for analyzing the stability of equilibria. The stability analysis for scalar ordinary
differential equations
\[ \frac{du}{dt} = F(u) \tag{19.41} \]
is particularly easy. As noted above, we assume throughout that F ∈ C¹ is continuously
differentiable.
The first observation is that all non-equilibrium solutions u(t) are strictly monotone
functions, meaning they are either always increasing or always decreasing. If F(u) > 0,
then, according to the equation, u̇ > 0 and hence u(t) is increasing. Vice versa, solutions
are decreasing at any point where F(u) < 0. Consequently, any non-monotone solution
would have to pass through an equilibrium value where F(u*) = 0. But this would violate
Proposition 19.11, which implies that the only solution taking on an equilibrium value is
the equilibrium solution u(t) ≡ u*. This proves the claim.
Since a non-equilibrium solution u(t) is everywhere monotone, there are only three
possible things that it can do:
(a) It becomes unbounded at some finite time: u(t) → ∞ or −∞ as t → t*.
(b) It exists for all t ≥ t₀, but becomes unbounded as t → ∞.
(c) It exists for all t ≥ t₀ and has a limiting value, u(t) → u* as t → ∞, which, by
Proposition 19.12, must be an equilibrium of the equation.
Let us look more carefully at the last eventuality. Let u* be an equilibrium point, so
F(u*) = 0. Suppose that F(u) > 0 for u slightly below u*. Any solution that starts out
below, but sufficiently close to u* must be increasing. Moreover, Proposition 19.11 implies

that u(t) < u* for all t, since it cannot pass the equilibrium without violating uniqueness.
Therefore, u(t) is a solution of type (c), and hence must have limiting value u*, being the
only equilibrium solution it can increase to. Therefore, in this situation, the equilibrium
point u* is asymptotically stable from below; solutions that start out slightly below return
to it in the limit. On the other hand, if F(u) < 0 for u slightly below u*, then any solution
that starts out in this regime will be monotonically decreasing, and so goes away from the
equilibrium point, which is thus unstable from below.
By the same reasoning, if F(u) < 0 for u slightly above u*, then such solutions will
be monotonically decreasing, bounded from below by u*, and hence have no choice but
to tend to u* in the limit. Under this condition, the equilibrium point is asymptotically
stable from above. The reverse inequality corresponds to solutions that increase away from
u*, which is thus unstable from above. Combining the two stable cases produces the basic
condition for asymptotic stability of scalar ordinary differential equations.
Theorem 19.18. An equilibrium point u* of an autonomous scalar differential equation is asymptotically stable if and only if F(u) > 0 for u* − δ < u < u* and F(u) < 0 for
u* < u < u* + δ, for some δ > 0.
If the inequalities are reversed, the equilibrium point is unstable; all nearby solutions
will go away from the equilibrium. The two cases are illustrated in Figure as1 . An
equilibrium point where F(u) is of one sign on both sides, e.g., the point u* = 0 for
F(u) = u², is stable from one side, and unstable from the other.
Example 19.19. Consider the differential equation
\[ \frac{du}{dt} = u - u^3. \tag{19.42} \]
Solving the algebraic equation F(u) = u − u³ = 0, we find that the equation has three
equilibria: u₁* = −1, u₂* = 0, u₃* = +1. The graph of the function F(u) = u − u³ switches
from positive to negative at the first equilibrium point, which proves its stability. Similarly, the graph goes back from negative to positive at u₂* = 0, proving the instability of
the second equilibrium. Finally, the third equilibrium is stable because F(u) changes from
positive to negative there.
With this information coupled with monotonicity, we can completely characterize the
behavior of all solutions to the system. Any solution with negative initial condition u₀ < 0
will end up, asymptotically, at the first equilibrium, u(t) → −1 as t → ∞. Indeed, if
u₀ < −1, then u(t) is monotonically increasing to −1, while if −1 < u₀ < 0, the solution
is decreasing towards −1. On the other hand, if u₀ > 0 the solution ends up at the last
equilibrium, u(t) → +1; those with 0 < u₀ < 1 are monotonically increasing, while those
with 1 < u₀ are decreasing. The only solution that does not end up at either −1 or +1 as
t → ∞ is the unstable equilibrium solution u(t) ≡ 0.
Thus, the sign of the function F(u) near an equilibrium determines its stability.
In most instances, this can be checked by looking at the derivative of the function at the
equilibrium. If F′(u*) < 0, then we are in the stable situation, whereas if F′(u*) > 0,
then we are unstable on both sides. The borderline case where F′(u*) = 0 requires further
analysis to resolve the stability of the equilibrium in question.

Theorem 19.20. Let u* be an equilibrium point for a scalar ordinary differential
equation u̇ = F(u). If F′(u*) < 0, then u* is asymptotically stable. If F′(u*) > 0, then u*
is unstable.
Thus, in the preceding example, F′(u) = 1 − 3u², and we compute its value at the
equilibria:
\[ F'(-1) = -2 < 0, \qquad F'(0) = 1 > 0, \qquad F'(1) = -2 < 0. \]
The signs reconfirm our conclusion that ±1 are stable equilibria, while 0 is unstable.
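A quick symbolic cross-check of this computation (an illustration of mine, assuming SymPy is available):

```python
import sympy as sp

u = sp.symbols("u")
F = u - u**3                                # right hand side of (19.42)

equilibria = sp.solve(sp.Eq(F, 0), u)       # the roots -1, 0, 1
for u_star in equilibria:
    slope = sp.diff(F, u).subs(u, u_star)   # F'(u*): negative means asymptotically stable
    verdict = "asymptotically stable" if slope < 0 else "unstable"
    print(u_star, slope, verdict)
```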
Theorem 19.20 is not quite as powerful as the direct test in Theorem 19.18, but does
have the advantage of being a bit easier to use, and, more significantly, generalizing to
systems of ordinary differential equations. In the borderline case when F′(u*) = 0, the
derivative test is inconclusive. For example, the equations du/dt = u³ and du/dt = −u³
both satisfy F′(0) = 0 at the equilibrium point u* = 0. But, using the criterion of
Theorem 19.18, we conclude that the former has an unstable equilibrium, while the latter
is stable.
Linearization and Stability
In higher dimensional situations, we can no longer rely on monotonicity properties
of solutions, and a more sophisticated approach to the stability of equilibrium solutions
is required. The key idea is already contained in the second characterization of stable
equilibria in Theorem 19.20. The derivative F 0 (u) determines the slope of its tangent line,
which is the linear approximation to the function F (u) at the equilibrium point. The
stability of the linearization of the differential equation at the equilibrium point can be
determined using the techniques from Chapter 8. In most situations, linear stability or
instability carries over to the corresponding nonlinear system.

Let us revisit the scalar case du/dt = F(u) from this point of view. Linearization of a
scalar function at a point means to replace it by its tangent line approximation, or first
order Taylor polynomial
F(u) ≈ F(u*) + F'(u*)(u - u*).   (19.43)
If u* is an equilibrium point, then F(u*) = 0 and the first term disappears. Therefore, we
expect that, near the equilibrium point, the solutions to the nonlinear ordinary differential
equation (19.41) should be well approximated by its linearization
du/dt = F'(u*)(u - u*).
Let us rewrite this equation in terms of the function v(t) = u(t) - u*, which represents the
deviation of the solution from equilibrium. Since dv/dt = du/dt, the linearized equation
takes the elementary form
dv/dt = a v,      where      a = F'(u*)   (19.44)
is the value of the derivative at the equilibrium point. Note that the original equilibrium
point u = u* corresponds to the zero equilibrium point v = v* = 0 of the linearized
equation.
We already know that the linear differential equation (19.44) has an asymptotically
stable equilibrium at v* = 0 if and only if a = F'(u*) < 0, while for a = F'(u*) > 0
the origin is unstable. Thus, Theorem 19.20 tells us that the stability properties of the
nonlinear equilibrium u* are almost entirely governed by those of its linearization (19.43).
The one exception is the borderline case where a = F'(u*) = 0. Here, the linearization,
namely dv/dt = 0, has a stable equilibrium at v* = 0, but this fact is not sufficient to guarantee
stability of the nonlinear equation. When the linearized system is only neutrally stable,
the neglected nonlinear terms can shift us into either a stable or an unstable situation, and
we are in need of more sophisticated analytical tools, e.g., a higher order Taylor expansion,
to fully resolve the situation.
The same linearization idea can be applied to analyze the stability of an equilibrium
solution u* to a first order autonomous system
du/dt = F(u).
We approximate the function F(u) at an equilibrium point where F(u*) = 0 by its first
order Taylor expansion
F(u) ≈ F(u*) + F'(u*)(u - u*) = F'(u*)(u - u*),   (19.45)
where F'(u*) denotes its n × n Jacobian matrix (18.27) at the equilibrium point. Thus, the
deviation from equilibrium v(t) = u(t) - u* should be governed by the linear approximation
dv/dt = A v,      where      A = F'(u*).   (19.46)
According to Theorem 8.41, the linearized approximation has an asymptotically stable
zero solution if and only if all the eigenvalues of the coefficient matrix A = F'(u*) have
negative real part. On the other hand, if one or more eigenvalues has positive real part,
then the zero solution is unstable. It can be proved, [59, 65], that these conditions are also
sufficient for asymptotic stability and instability in the nonlinear case.
Theorem 19.21. Let u* be an equilibrium point for the first order ordinary differential
equation du/dt = F(u). If all eigenvalues of the Jacobian matrix F'(u*) have negative
real part, Re λ_j < 0, then u* is asymptotically stable. If, on the other hand, F'(u*) has
one or more eigenvalues with positive real part, then u* is an unstable equilibrium.
The borderline case occurs when one or more of the eigenvalues is either 0 or purely
imaginary, i.e., Re λ_j = 0, while all other eigenvalues have negative real part. In these
cases, the linearization test is inconclusive, and we need more detailed information (which
may not be easy to come by) on how the nonlinear terms might affect any borderline
eigenvalues lying on the imaginary axis. Their effect may be to nudge the eigenvalue into
the left half plane, stabilizing the solutions, or into the right half plane, destabilizing them.
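To illustrate how Theorem 19.21 is used in practice, here is a small numerical sketch (Python with
numpy assumed available; not part of the original text). It forms a finite difference approximation to
the Jacobian of a planar vector field at a proposed equilibrium and inspects the real parts of its
eigenvalues; the sample system is a damped pendulum, treated in detail in Example 19.22 below.

import numpy as np

def jacobian(F, u_star, h=1e-6):
    """Finite difference approximation to the Jacobian matrix F'(u*)."""
    u_star = np.asarray(u_star, dtype=float)
    n = u_star.size
    J = np.zeros((n, n))
    for j in range(n):
        e = np.zeros(n)
        e[j] = h
        J[:, j] = (F(u_star + e) - F(u_star - e)) / (2 * h)
    return J

# sample planar system: du/dt = v, dv/dt = -sin(u) - v
F = lambda w: np.array([w[1], -np.sin(w[0]) - w[1]])

for eq in ([0.0, 0.0], [np.pi, 0.0]):
    lam = np.linalg.eigvals(jacobian(F, eq))
    verdict = "asymptotically stable" if np.all(lam.real < 0) else "unstable"
    print(eq, lam.round(3), verdict)

At (0, 0) both eigenvalues have negative real part, while at (π, 0) one eigenvalue is positive,
matching the stability assignments one would make by hand.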
Example 19.22. The second order ordinary differential equation
m d²θ/dt² + μ dθ/dt + κ sin θ = 0   (19.47)
describes the damped oscillations of a rigid pendulum that rotates on a pivot under a
gravitational force. The unknown function θ(t) measures the angle of the pendulum from
the vertical, as illustrated in Figure pend . The constant m > 0 is the mass of the
pendulum bob, μ > 0 the coefficient of friction, assumed here to be strictly positive, and
κ > 0 the restoring gravitational force.
In order to study the solutions and their stability, we begin by converting this equation
into a first order system by setting
u(t) = θ(t),      v(t) = dθ/dt,   (19.48)
and so
du/dt = v,      dv/dt = -α sin u - β v,      where      α = κ/m,      β = μ/m,   (19.49)
are both positive constants. The equilibria occur where the right hand sides of the first
order system (19.49) simultaneously vanish:
v = 0,      -α sin u - β v = 0,      and hence      u = 0, ±π, ±2π, . . . .
Thus, the system has infinitely many equilibrium points u*_k = (kπ, 0) for k = 0, ±1, ±2, . . . .
The equilibrium point u*_0 = (0, 0) corresponds to θ = 0, dθ/dt = 0, which means that the
pendulum is at rest at the bottom of its arc. Our physical intuition leads us to expect
this to describe a stable configuration, as the frictional effects will eventually damp out
small motions of the pendulum. The next equilibrium u*_1 = (π, 0) corresponds to θ = π,
dθ/dt = 0, which means that the pendulum stays motionless at the top of its arc. Theoretically,
this is a possible equilibrium configuration, but highly unlikely to be observed in practice,
and thus should be unstable. Now, since u = θ is an angular variable, equilibria whose u
values differ by an integer multiple of 2π define the same physical configuration, and hence
should have identical stability properties. Therefore, the remaining equilibria u*_k physically
correspond to one or the other of these two possible equilibrium positions; when k = 2j is
even, the pendulum is at the bottom, while when k = 2j + 1 is odd, the pendulum is at
the top.
Let us now confirm our intuition using the linearization stability criterion of Theorem 19.21. The right hand side of the system, namely
F(u, v) = \begin{pmatrix} v \\ -α sin u - β v \end{pmatrix},      has Jacobian matrix      F'(u, v) = \begin{pmatrix} 0 & 1 \\ -α cos u & -β \end{pmatrix}.
At the bottom equilibrium u*_0 = (0, 0), the Jacobian matrix
F'(0, 0) = \begin{pmatrix} 0 & 1 \\ -α & -β \end{pmatrix}      has eigenvalues      λ = ( -β ± √(β² - 4α) ) / 2.
Under our assumption that α, β > 0, the eigenvalues both have negative real part, and the
origin is a stable equilibrium. If β² < 4α (the underdamped case), the eigenvalues are
complex with negative real part, and hence, in the terminology of Section 8.7, the origin is
a stable focus. In the phase plane, the solutions spiral in to the focus, which corresponds
to a pendulum with damped oscillations of decreasing magnitude. On the other hand, if
β² > 4α, then the system is overdamped, and the origin is a stable node. In this case,
the solutions decay exponentially fast. Physically, this would be like a pendulum moving
in a vat of molasses. In both cases, the phase portrait of the nonlinear motion near the
equilibrium position closely matches the linearized problem. The same analysis holds at
all even multiples of π, which really represent the same bottom equilibrium point.
On the other hand, at the top equilibrium u*_1 = (π, 0), the Jacobian matrix
F'(π, 0) = \begin{pmatrix} 0 & 1 \\ α & -β \end{pmatrix}      has eigenvalues      λ = ( -β ± √(β² + 4α) ) / 2.
In this case, one of the eigenvalues is real and positive while the other is negative. Therefore,
the linearized system has a saddle point, and so the nonlinear system is also unstable at
this equilibrium point. Any tiny perturbation of a pendulum that is standing upright will
cause it to fall down. Eventually, friction will cause it to return to the stable bottom
equilibrium. Again, the nonlinear system near the unstable equilibrium closely matches
the linear saddle point phase portrait.
The complete phase portrait of an underdamped pendulum appears in Figure dpen .
Note that, as advertised, almost all solutions end up spiraling into the stable equilibria.
Solutions with a large initial velocity end up spinning a number of times around the
center, but eventually frictional forces win out and the pendulum ends up in a damped
oscillatory mode. The unstable equilibria have the same basic saddle shape as their linear
counterparts. Each gives rise to two special solutions in which the pendulum spins around
a few times, and, in the t → ∞ limit, ends up upright at the unstable equilibrium position.
However, this solution is practically impossible to achieve in a physical environment, as any
tiny perturbation (e.g., a breath of air) will cause the pendulum to slightly deviate and
then end up decaying into the usual damped oscillatory motion at the bottom.
A deeper analysis shows that equilibria whose eigenvalues do not lie on the imaginary
axis, so Re λ_j ≠ 0 for all j, are structurally stable. This means that not only are the stability properties dictated by the linearized approximations, but, near the equilibrium point,
solutions to the nonlinear system are slight perturbations of those of the corresponding
linearized system. For instance, stable foci of the linearized system correspond to stable
foci of the nonlinear counterpart, while unstable saddle points remain saddle points, although the saddle rays are slightly curved as they depart from the equilibrium. In other
words, the structural stability of linear systems, as discussed at the end of Section 8.7 also
carries over to the nonlinear regime near an equilibrium. The general statement of this
important result is known as the Center Manifold Theorem, and the complete statement
and proof can be found, for instance, in [59, 65].
Example 19.23. Consider the unforced van der Pol system
du/dt = v,      dv/dt = -(u² - 1) v - u,   (19.50)
that we derived in Example 19.5. The only equilibrium point is at the origin u = v = 0.
Computing the Jacobian matrix of the right hand side,
F'(u, v) = \begin{pmatrix} 0 & 1 \\ -2uv - 1 & 1 - u² \end{pmatrix},      hence      F'(0, 0) = \begin{pmatrix} 0 & 1 \\ -1 & 1 \end{pmatrix}.
The eigenvalues are (1 ± i√3)/2, and correspond to an unstable focus of the linearized
system near the equilibrium point. Therefore, the origin is an unstable equilibrium for the
nonlinear van der Pol system. Solutions starting out near 0 spiral away. On the other
hand, it can be shown that solutions that are sufficiently far away from the origin spiral in.
So what happens to the solutions? As illustrated in the phase plane portrait Figure vdp , all of the solutions spiral towards a stable periodic orbit, known as a limit cycle
for the system. Any initial data will eventually end up following the periodic orbit as
it circles around the origin. Proof of the existence of a limit cycle relies on the more
sophisticated Poincaré-Bendixson Theory for planar autonomous systems; see [59, 65].
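A quick numerical experiment (a sketch, not part of the original text; it assumes scipy is installed)
makes the limit cycle visible without any plotting: integrating (19.50) from one initial point near the
origin and one far away, both trajectories settle onto a closed orbit of roughly the same amplitude.

import numpy as np
from scipy.integrate import solve_ivp

def van_der_pol(t, w):
    u, v = w
    return [v, -(u**2 - 1) * v - u]

for w0 in ([0.01, 0.0], [4.0, 0.0]):     # one start near the origin, one far away
    sol = solve_ivp(van_der_pol, (0.0, 60.0), w0, max_step=0.01)
    u = sol.y[0]
    late = u[sol.t > 40.0]               # discard the transient, keep the late-time behavior
    print(w0, "late-time amplitude of u:", late.max() - late.min())

Both runs report essentially the same amplitude (u oscillating between roughly -2 and 2), which is the
limit cycle that the solutions approach.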
Example 19.24. The nonlinear system
du/dt = u (v - 1),      dv/dt = 4 - u² - v²,
has four equilibria: (0, ±2) and (±√3, 1). The Jacobian matrix for the system is
F'(u, v) = \begin{pmatrix} v - 1 & u \\ -2u & -2v \end{pmatrix}.
A table of the Jacobian matrices and eigenvalues at the equilibrium points, together with their
stability, follows. A complete phase portrait can be found in Figure exxxx .

Equilibrium Point     Jacobian matrix                                                     Eigenvalues      Stability

(0, 2)                \begin{pmatrix} 1 & 0 \\ 0 & -4 \end{pmatrix}                       1, -4            unstable saddle

(0, -2)               \begin{pmatrix} -3 & 0 \\ 0 & 4 \end{pmatrix}                       -3, 4            unstable saddle

(√3, 1)               \begin{pmatrix} 0 & √3 \\ -2√3 & -2 \end{pmatrix}                   -1 ± i√5         stable focus

(-√3, 1)              \begin{pmatrix} 0 & -√3 \\ 2√3 & -2 \end{pmatrix}                   -1 ± i√5         stable focus


Conservative Systems
When modeling a physical system that includes some form of damping (due to
friction, viscosity, or dissipation), the linearization test for stability of equilibria will usually suffice. However, when dealing with conservative systems, in which there is no damping
and so energy is preserved, the test is usually inconclusive, and one must rely on alternative stability criteria. In many instances, one can exploit the conservation of energy for
this purpose. We return to our general philosophy that minimizers of an energy function
should be stable (but not necessarily asymptotically stable) equilibria. Thus, we discover
that minimization principles retain their centrality, even for nonlinear physical systems.
Classically, a quantity such as the energy that is conserved, which means that it is
constant on solutions to a system of differential equations, is known as a first integral or
conservation law . First, let us state the basic definition.

Definition 19.25. A first integral of an autonomous system u = F(u) is a realvalued function I(u) which is constant on solutions.
The constant value of the first integral will depend upon the solution, and is fixed
by whatever value it assumed at the initial time t0 . In other words, a first integral must
satisfy the condition
I(u(t)) = I(u(t0 ))
(19.51)
whenever u(t) is a solution to the differential equation. Therefore, every solution to the
dynamical system is constrained to move along a single level set { I(u) = c } of the first
integral I. Any constant function I(u) ≡ c is trivially a first integral, but it carries no
information whatsoever about the solutions, and so is uninteresting. We will call any
autonomous system that possesses a nontrivial first integral I(u) a conservative system.
How do we find first integrals? In applications, one often appeals to physical principles
such as conservation of energy, conservation of linear or angular momentum, conservation
of mass, and so on. Mathematically, the most convenient way to check whether a function
is constant is to verify that its derivative is identically zero. Thus, differentiating (19.51)
with respect to t and making use of the chain rule leads to the basic condition
0 = d/dt I(u(t)) = ∇I(u(t)) · du/dt = ∇I(u(t)) · F(u(t)).   (19.52)
The final expression is the directional derivative, cf. (18.62), of I(u) with respect to the
vector field v = F(u) that specifies the differential equation. Writing out (19.52) in detail,
we find that a first integral I(u_1, . . . , u_n) must satisfy a first order linear partial differential
equation
F_1(u_1, . . . , u_n) ∂I/∂u_1 + · · · + F_n(u_1, . . . , u_n) ∂I/∂u_n = 0.   (19.53)

As such, it looks harder to solve than the original ordinary differential equation! Usually,
one is forced to rely on either physical intuition, intelligent guesswork, symmetry properties, or, as a last resort, luck to find first integrals. A deeper fact, due to the pioneering

In fact, the general solution method of such partial differential equations, [ 97 ], relies on the
integration of ordinary differential equations. But then we are back to where we started!

twentieth century mathematician Emmy Noether, cf. [97], is that first integrals and conservation laws are the result of underlying symmetry properties of the differential equation.
Like many nonlinear methods, it remains the subject of contemporary research.
Let us specialize to the case of a planar autonomous system
du/dt = F(u, v),      dv/dt = G(u, v).   (19.54)
According to (19.53), a first integral I(u, v) of this system must satisfy the linear partial
differential equation
F(u, v) ∂I/∂u + G(u, v) ∂I/∂v = 0.   (19.55)
This first order partial differential equation can be solved as follows: we consider
the auxiliary first order scalar ordinary differential equation
dv/du = G(u, v)/F(u, v)   (19.56)
for v = h(u) as a function of u. Note that the latter ordinary differential equation can
be formally obtained by dividing the second equation in the original system (19.54) by
the first, and then canceling the time differentials dt. Suppose we can write the general
solution to the scalar equation (19.56) in the form
I(u, v) = c,

(19.57)

where c is a constant of integration. The claim is that I(u, v) is then a first integral of
the original system (19.54). Indeed, differentiating (19.57) with respect to u and using the
chain rule, we find
0 = d/du I(u, v) = ∂I/∂u + dv/du ∂I/∂v = ∂I/∂u + (G(u, v)/F(u, v)) ∂I/∂v.
Clearing the denominator, we conclude that I(u, v) solves the partial differential equation (19.55),
which justifies our claim.
Example 19.26. As an elementary example, consider the linear system
du/dt = v,      dv/dt = -u.   (19.58)
To construct a first integral, we form the auxiliary equation (19.56), which is
dv/du = -u/v.
(We assume that F(u, v) is not identically zero; otherwise, u = c is itself a first integral, and the
system reduces to a scalar equation for v. See also Section 21.1 for an alternative perspective.)

This first order ordinary differential equation can be solved by separating variables:
v dv = -u du,      and hence      (1/2) u² + (1/2) v² = c,
where c is the constant of integration. Therefore, by the preceding result,
I(u, v) = (1/2) u² + (1/2) v²
is a first integral. The level sets of I(u, v) are the circles centered at the origin, and we
recover the fact that the solutions of (19.58) go around the circles. The origin is a stable
equilibrium, a center.
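The defining condition (19.55) can also be checked mechanically once a candidate first integral is in
hand. The following symbolic sketch (not from the original text; it assumes the sympy library is
available) verifies that I(u, v) = u²/2 + v²/2 satisfies (19.55) for the system (19.58):

import sympy as sp

u, v = sp.symbols('u v')

F, G = v, -u                              # right hand sides of (19.58)
I = u**2 / 2 + v**2 / 2                   # candidate first integral

residual = sp.simplify(F * sp.diff(I, u) + G * sp.diff(I, v))
print(residual)                           # prints 0, so I is constant along solutions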
This example hints at the importance of first integrals in stability theory. The following key result confirms our general philosophy that energy minimizers, or, more generally,
minimizers of first integrals, are necessarily stable equilibria.
Theorem 19.27. Let I(u) be a first integral for the autonomous system of ordinary
differential equations du/dt = F(u). If u* is a strict local minimum of I, then u* is a stable
equilibrium point for the system.
Proof : We first prove that u* is an equilibrium. Indeed, the solution u(t) with initial
condition u(t_0) = u* must maintain the value of I(u(t)) = I(u*). But, by definition of a
strict minimum, there are no points near u* that have the same value of I, and hence, by
continuity, the solution cannot leave u*.
To prove stability, we set
M(r) = max { I(u) : ‖u - u*‖ ≤ r },      m(r) = min { I(u) : ‖u - u*‖ = r }.
Thus M(r) is the maximum value of the integral over a ball of radius r centered at the
minimum, while m(r) is the minimum over its boundary sphere of radius r. (We write as if
the norm is the Euclidean norm, but any other norm will work equally well for this proof.)
Since I is continuous, so are m and M. Since u* is a local minimum, M(r) ≥ m(r) > I(u*)
for all sufficiently small r > 0.
Given ε > 0 sufficiently small, we can choose a δ > 0 such that M(δ) < m(ε). Then,
if u(t_0) = u_0 satisfies ‖u_0 - u*‖ ≤ δ, then I(u_0) ≤ M(δ). But I(u(t)) is fixed, and so the
resulting solution u(t) cannot cross the sphere of radius ε, since all points v on the sphere
have a strictly larger value of I(v) ≥ m(ε) > M(δ) ≥ I(u_0). Therefore, ‖u(t) - u*‖ < ε,
and hence we have fulfilled Definition 19.15 of stability.      Q.E.D.
Remark : The proof of Theorem 19.27, in fact, does not rely upon the fact that u*
is a minimum! Indeed, we reach exactly the same conclusion at a strict local maximum of
the first integral I, and so they are also stable equilibria. Or, to phrase it another way,
maxima of I(u) are minima of its negative -I(u), which is also a first integral. Saddle
points, however, are rarely stable. While at first sight, this may appear to contradict our
intuition, the fact is that energy functions typically do not have maxima. Indeed, the
energy is typically the sum of kinetic and potential contributions. While potential energy


can admit maxima, e.g., the pendulum at the top of its arc, these are only saddle points for
the full energy function, since the kinetic energy term can always be increased by moving
a bit faster.
Example 19.28. Consider the specific predator-prey system
du/dt = 2u - uv,      dv/dt = -9v + 3uv,
modeling populations of lions and zebra, which is a special case of (19.23). According to
Example 19.4, there are two possible equilibria:
u*_1 = v*_1 = 0,      and      u*_2 = 3,  v*_2 = 2.

Let us try to determine their stability by the linearization criterion. The Jacobian matrix
for the system is
F'(u, v) = \begin{pmatrix} 2 - v & -u \\ 3v & 3u - 9 \end{pmatrix}.
At the first, trivial equilibrium,
F'(0, 0) = \begin{pmatrix} 2 & 0 \\ 0 & -9 \end{pmatrix},      with eigenvalues      2 and -9.
Since there is one positive and one negative eigenvalue, the origin is an unstable saddle
point. On the other hand, at the nonzero equilibrium, the Jacobian matrix
F'(3, 2) = \begin{pmatrix} 0 & -3 \\ 6 & 0 \end{pmatrix}      has purely imaginary eigenvalues      ± 3√2 i.
Since they are purely imaginary, the linearized system has a stable center. But as we are
in a borderline situation, Theorem 19.21 cannot be applied, and the linearization stability
test is inconclusive.
It turns out that the predator-prey model has a first integral, and so represents a
conservative system. Following (19.56), to find it, we must solve the auxiliary equation
dv/du = (-9v + 3uv)/(2u - uv) = (-9/u + 3)/(2/v - 1).
Fortunately, this is a separable first order ordinary differential equation. Integrating,
∫ (2/v - 1) dv = 2 log v - v = ∫ (-9/u + 3) du = -9 log u + 3u + c,
where c is the constant of integration. Writing the solution in the form (19.57), we conclude
that
I(u, v) = 9 log u - 3u + 2 log v - v = c
is a first integral of the system. The solutions to the system must stay on the level sets of
I. Note that
∇I(u, v) = ( 9/u - 3 , 2/v - 1 ),      and hence      ∇I(3, 2) = 0,
which shows that the second equilibrium is a critical point. (The zero equilibrium is a
singularity.) Moreover, the Hessian matrix at the critical point,
∇²I(3, 2) = \begin{pmatrix} -1 & 0 \\ 0 & -1/2 \end{pmatrix},
is negative definite, and hence u*_2 = (3, 2)^T is a strict local maximum of the integral
I(u, v). Thus, Theorem 19.27 (rephrased for maxima) proves that the equilibrium point is
stable. Moreover, the nearby level sets of I are closed curves, and so, while the linearization
test is inconclusive, the equilibrium point does in fact turn out to be a stable center.
The first integral serves to completely characterize the qualitative behavior of the
predator-prey system. In the physically relevant region, i.e., the upper right quadrant Q =
{ u > 0, v > 0 } where both populations are positive, with the exception of the equilibrium
point u*_2 = (3, 2)^T itself, all of the level sets of the first integral are closed curves encircling
u*_2. Thus, all non-equilibrium solutions to the system are periodic, circling around the
stable equilibrium along the level curves illustrated in Figure prey .
idealized ecological model, the zebra, u, and lion, v, populations maintain a balance over
the long term, but vary periodically between maximum and minimum values. Observe also
that the maximum and minimum values of the two populations are not achieved at the
same time. Starting with a small number of predators, the number of prey will initially
increase. The predators then have more food, and so also increase in numbers. At a
certain critical point, the predators are sufficiently numerous as to kill prey faster than
they can reproduce. At this point, the prey population has reached its maximum, and
begins to decline. But it takes a while for the predator population to feel the effect, and
they continue to increase in numbers. However, eventually the increasingly rapid decline
in the number of prey begins to affect the predators, which reach a maximum population
subsequent to that of the prey. After this, both populations are in decline. Eventually,
enough predators have died off so as to relieve the pressure on the prey, whose population
bottoms out, and then slowly begins to rebound. A bit later, the number of predators also
reaches a minimum, at which point the entire growth and decay cycle starts over again.
Such periodic phenomena are observed, roughly, in many natural ecological systems.
The period of the population cycle depends upon how far away from the stable equilibrium it lies. Near equilibrium, the solutions are close to those of the linearized system
which, in view of the eigenvalues ± 3√2 i, are periodic with frequency 3√2, and hence have a fixed
period 2π/(3√2). However, solutions that are far away from equilibrium have much longer
periods, and so greater imbalances between lions and zebras lead to longer periods, and
more radically varying numbers of the two populations. Understanding the mechanisms
behind these mathematical cycles is becoming increasingly important in the proper management of natural resources.
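These qualitative conclusions are easy to corroborate numerically. The sketch below (not part of the
original text; scipy assumed) integrates the lion and zebra system of Example 19.28 and monitors the
first integral I(u, v) = 9 log u - 3u + 2 log v - v along the computed orbit; up to the accuracy of the
integrator it remains constant, and the populations return periodically to their starting values.

import numpy as np
from scipy.integrate import solve_ivp

def predator_prey(t, w):
    u, v = w                              # u = zebra (prey), v = lions (predators)
    return [2*u - u*v, -9*v + 3*u*v]

def I(u, v):                              # the first integral found above
    return 9*np.log(u) - 3*u + 2*np.log(v) - v

sol = solve_ivp(predator_prey, (0.0, 10.0), [1.0, 1.0], rtol=1e-10, atol=1e-12)
values = I(sol.y[0], sol.y[1])
print("drift in I along the orbit:", values.max() - values.min())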
Example 19.29. In our next example, we look at the undamped oscillations of a
pendulum. When we set the friction coefficient μ = 0, the nonlinear second order ordinary
differential equation (19.47) becomes
m d²θ/dt² + κ sin θ = 0.   (19.59)
As before, we convert the equation into a first order system
du/dt = v,      dv/dt = -α sin u,   (19.60)
where
u(t) = θ(t),      v(t) = dθ/dt,      and      α = κ/m.
The equilibria, u*_n = (nπ, 0) for n = 0, ±1, ±2, . . . , are the same as in the damped case.
The pendulum is either at the bottom (n even) or the top (n odd) of the circle.
Let us first look at the linearized stability test. In this case, the Jacobian matrix of
(19.60) is
F'(u, v) = \begin{pmatrix} 0 & 1 \\ -α cos u & 0 \end{pmatrix}.

At the upper equilibria u*_{2k+1} = ((2k + 1)π, 0), the Jacobian matrix
F'((2k + 1)π, 0) = \begin{pmatrix} 0 & 1 \\ α & 0 \end{pmatrix}      has real eigenvalues      ± √α,
and hence these equilibria are unstable saddle points, just as in the damped version. On
the other hand, at the bottom equilibria u*_{2k} = (2kπ, 0)^T, the Jacobian matrix
F'(2kπ, 0) = \begin{pmatrix} 0 & 1 \\ -α & 0 \end{pmatrix}      has purely imaginary eigenvalues      ± i √α.

Therefore, without the benefit of damping, the linearized stability test is inconclusive, and
the stability of the bottom equilibria remains in doubt.
Since we are dealing with a conservative system, the total energy of the pendulum,
E(u, v) = (1/2) m v² + κ (1 - cos u) = (m/2) (dθ/dt)² + κ (1 - cos θ),   (19.61)
should provide us with a first integral. Note that E is a sum of two terms, which represent,
respectively, the kinetic energy due to the motion, and the potential energy due to the
height of the pendulum bob. To verify that E(u, v) is indeed a first integral, we compute
dE/dt = m v dv/dt + κ sin u du/dt = -κ v sin u + κ v sin u = 0,      since      dv/dt = -(κ/m) sin u,  du/dt = v.
Therefore, E is indeed constant on solutions, reconfirming the physical basis of the model.
The phase plane solutions to the pendulum equation move along the level sets of the
energy function E(u, v), which are plotted in Figure pen . The critical points are the
equilibria; these are where
∇E(u, v) = ( κ sin u , m v ) = 0,      and hence      u = u*_n = nπ,  v = 0,
for some integer n. (In a physical system, the potential energy is only defined up to an additive
constant. Here we have fixed the zero energy level to be at the bottom of the pendulum's arc.)
To characterize the critical points, we appeal to the second derivative
test, and so evaluate the Hessian
∇²E(u, v) = \begin{pmatrix} κ cos u & 0 \\ 0 & m \end{pmatrix}.
At the bottom equilibrium points u*_{2k}, the Hessian ∇²E(2kπ, 0) = \begin{pmatrix} κ & 0 \\ 0 & m \end{pmatrix} is positive
definite, since κ and m are positive constants. Therefore, the bottom equilibria are strict
local minima of the energy, and so Theorem 19.27 guarantees their stability. The upper
equilibrium points u*_{2k+1} are saddle points for the energy function since their Hessian
∇²E((2k + 1)π, 0) = \begin{pmatrix} -κ & 0 \\ 0 & m \end{pmatrix} is indefinite. Indeed, the phase portrait of the nonlinear
pendulum near the unstable equilibria looks like a perturbed version of a linear saddle
point.
Each stable equilibrium is surrounded by a family of closed elliptically-shaped level
curves, and hence forms a center. Each closed curve corresponds to a periodic solution of
the system, in which the pendulum moves back and forth.
Near the equilibrium, the period
is close to that of the linearized system, namely 2π/√α, as predicted by the eigenvalues.
This fact underlies the use of pendulum-based clocks in time keeping, first recognized by
Galileo. Grandfather clocks keep accurate time because the amplitudes of the oscillations
of their pendula are small. However, as we move away from the equilibrium point, the
solutions with very large amplitude oscillations, in which the pendulum becomes nearly
vertical, have much longer periods.
The limiting case of the periodic solutions is of particular interest. The pair of curves
connecting two distinct unstable equilibria are known as the homoclinic orbits, and play
an essential role in the more advanced analysis of the pendulum under perturbations.
Physically, a homoclinic orbit corresponds to a pendulum that starts out just shy of vertical,
goes through exactly one full rotation, and eventually (as t → ∞) ends up vertical again.
The existence of a homoclinic orbit implies that a periodically forced pendulum exhibits
chaotic behavior, [36, 7]; see also Exercise .
Finally, the level sets lying above and below the cat's-eyes formed by the periodic
orbits are known as the running orbits. Since u = θ is a 2π-periodic angular variable, the
running orbits (u(t), v(t))^T = (θ(t), dθ/dt)^T, in fact, also correspond to periodic physical
motions, in which the pendulum rotates around and around its pivot point. Since energy
is conserved, the rotations persist forever. The larger the total energy E(u, v), the farther
away from the u-axis the level set, and the faster the pendulum spins.
In summary, the nature of a solution to the pendulum equation is almost entirely
characterized by its energy:
E = 0:            stable equilibria,
0 < E < 2κ:       oscillating orbits,
E = 2κ:           unstable equilibria and homoclinic orbits,
E > 2κ:           running orbits.

Example 19.30. The equations governing the rotation of a rigid body around a
fixed point are known as the Euler equations of rigid body mechanics. According to
Exercise , the eigenvectors of the positive definite inertia tensor of the body prescribe
the three mutually orthogonal principal axes of rotation. The corresponding eigenvalues
0 < I1 < I2 < I3 are the principal moments of inertia of the body. Let u1 (t), u2 (t), u3 (t)
denote the angular momenta of the body around its three principal axes. In the absence
of external forces, the dynamical system governing a rotating body takes the relatively
simple form
du2
du3
I I3
I I1
I I2
du1
u2 u3 ,
u1 u3 ,
u1 u2 . (19.62)
= 2
= 3
= 1
dt
I2 I3
dt
I1 I3
dt
I1 I2
This system models, for example, the dynamics of a satellite spinning in its orbit around
the earth. The solution will prescribe the rotations of the satellite around its center of
mass, but not the motion of the center of mass as the satellite orbits the earth.
The equilibria of the system are where the right hand sides simultaneously vanish,
which requires that either u_2 = u_3 = 0 or u_1 = u_3 = 0 or u_1 = u_2 = 0. In other words,
every point on the three coordinate axes is an equilibrium configuration! Since the u i
represent angular momenta, the equilibria correspond to the body spinning around one of
its principal axes at a fixed angular velocity. Let us look at the stability of these equilibrium
configurations. The linearization test fails completely as it must do whenever dealing
with a non-isolated equilibrium point. But the Euler equations turn out to have two
independent first integrals:
E(u) = (1/2) ( u_1²/I_1 + u_2²/I_2 + u_3²/I_3 ),      A(u) = (1/2) ( u_1² + u_2² + u_3² ).   (19.63)
The first is the total kinetic energy, while the second is the total angular momentum. The
proof that dE/dt = 0 = dA/dt for any solution is left to the reader.
Since both E and A are constant, the solutions to the system are constrained to
move along a common level set { E = e, A = a }. Thus, the solution curves are given
by intersecting the sphere A = a, of radius √(2a), with the ellipsoid where E = e. In
Figure rigid , we have graphed the intersection curves of a fixed sphere with a family of
ellipsoids corresponding to different values of the kinetic energy. The six equilibria on the
sphere are at its intersections with the coordinate axes. Those on the x and z axes are
surrounded by closed periodic orbits, and hence are stable equilibria; indeed, they are,
respectively, local minima and maxima of the energy when restricted to the sphere. On
the other hand, the pair of equilibria on the y axis have the form of a saddle point, and
so are unstable. Thus, a body that spins around its principal axes corresponding to the
smallest or the largest moments of inertia is stable, whereas one that spins around the
axis corresponding to the intermediate moment of inertia is unstable. This mathematical
deduction can be demonstrated physically by flipping a solid rectangular object, e.g., this
book, up into the air. It is easy to arrange it to spin around its long axis or its short axis
in a stable manner, but it will balk at attempts to make it rotate around its middle axis!
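The following sketch (not from the original text; the moments of inertia are sample values and scipy is
assumed available) integrates the Euler equations (19.62), confirms that both first integrals (19.63)
stay constant, and exhibits the instability of spinning about the middle axis: a tiny perturbation of a
pure u_2 spin grows to order one.

import numpy as np
from scipy.integrate import solve_ivp

I1, I2, I3 = 1.0, 2.0, 3.0                # sample principal moments, 0 < I1 < I2 < I3

def euler_equations(t, u):
    u1, u2, u3 = u
    return [(I2 - I3) / (I2 * I3) * u2 * u3,
            (I3 - I1) / (I1 * I3) * u1 * u3,
            (I1 - I2) / (I1 * I2) * u1 * u2]

def E(u):  # kinetic energy, the first integral in (19.63)
    return 0.5 * (u[0]**2 / I1 + u[1]**2 / I2 + u[2]**2 / I3)

def A(u):  # the second integral in (19.63)
    return 0.5 * (u[0]**2 + u[1]**2 + u[2]**2)

u0 = [0.001, 1.0, 0.001]                  # spin almost exactly about the middle axis
sol = solve_ivp(euler_equations, (0.0, 50.0), u0, rtol=1e-10, atol=1e-12)

print("drift in E:", np.ptp(E(sol.y)), "  drift in A:", np.ptp(A(sol.y)))
print("largest |u1| reached:", np.abs(sol.y[0]).max())   # grows to order 1: instability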
The Lyapunov Method
Nonconservative systems, particularly those incorporating damping and/or frictional effects, do not typically have first integrals. Physically, the energy of a damped
system is expected to be a decreasing function of time. Eventually, the system returns
to an equilibrium position, and the extra energy has been dissipated away. However, this
physical law has implications for the behavior of solutions. In particular, it can also be used
to prove stability, even in cases when the linearization stability test is inconclusive. The
nineteenth century Russian mathematician Alexander Lyapunov was the first to pinpoint
the importance of such functions.
Definition 19.31. A function L(u) is known as a Lyapunov function for the first
order system du/dt = F(u) if it satisfies
d/dt L(u(t)) ≤ 0      for all solutions      u(t).   (19.64)

It is worth pointing out that one can verify the Lyapunov inequality (19.64) without
actually having to solve the system. Namely, by the same chain rule computation as used
to establish the first integral criterion (19.52), we find
d/dt L(u) = ∇L(u) · F(u) ≤ 0      for all      u.

However, unlike first integrals which can, at least in principle, be systematically found by
solving a first order partial differential equation, finding Lyapunov functions is more like
an art form, usually relying on physical intuition or inspired guesswork.
The Lyapunov inequality (19.64) implies that a Lyapunov function must be decreasing,
L(u(t)) ≤ L(u(t_0))      for all      t > t_0,

when evaluated on any solution to the system. The proof of Theorem 19.27 can be readily
adapted to prove stability of a system with a Lyapunov function. Details can be found in
[59, 65].
Theorem 19.32. If L(u) is a Lyapunov function for the autonomous system of
ordinary differential equations du/dt = F(u) and u* is a strict local minimum of L, then u*
is a stable equilibrium point for the system. If the Lyapunov inequality (19.64) is strict,
then the minimum u* is, in fact, asymptotically stable.
In a damped mechanical system, the energy is decreasing, and so plays the role of a
Lyapunov function. Unlike first integrals, maxima of Lyapunov functions are not stable.
Example 19.33. Return to the planar system
du/dt = v,      dv/dt = -α sin u - β v,
describing the damped oscillations of a pendulum, as in (19.49). Physically, we expect
that the damping will cause a continual decrease in the total energy in the system, which,
by (19.61), is
E = (1/2) m v² + κ (1 - cos u).
We compute its time derivative when u(t), v(t) is a solution to the damped system. Recalling that α = κ/m, β = μ/m, we find
dE/dt = m v dv/dt + κ sin u du/dt = m v (-α sin u - β v) + κ v sin u = -μ v² ≤ 0,
since we are assuming that the frictional coefficient μ > 0. Therefore, the energy satisfies
the Lyapunov stability criterion, and hence Theorem 19.32 re-establishes the stability of
the energy minima u = 2kπ, v = 0, where the damped pendulum is at the bottom of the
arc.
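A corresponding numerical check (a sketch, not from the text; scipy assumed, with the sample values
α = 1, β = 0.5, m = κ = 1) integrates the damped pendulum and confirms that the energy computed along
the solution never increases, apart from round-off in the integrator:

import numpy as np
from scipy.integrate import solve_ivp

alpha, beta = 1.0, 0.5                    # sample values of alpha = kappa/m, beta = mu/m
m, kappa = 1.0, 1.0

def damped_pendulum(t, w):
    u, v = w
    return [v, -alpha * np.sin(u) - beta * v]

def energy(u, v):
    return 0.5 * m * v**2 + kappa * (1 - np.cos(u))

sol = solve_ivp(damped_pendulum, (0.0, 40.0), [2.5, 0.0], max_step=0.01)
E_vals = energy(sol.y[0], sol.y[1])
print("energy never increases:", bool(np.all(np.diff(E_vals) <= 1e-9)))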

19.4. Numerical Solution Methods.


Since we are not able to explicitly solve the vast majority of differential equations,
the design of suitable numerical algorithms for accurately approximating the solutions is
an essential component of the applied mathematician's toolbox. Owing to the importance
of differential equations in a host of applications, a tremendous effort has gone into the
development of numerical solution methods, some dating back to the beginnings of the
calculus. Nowadays, many types of computer packages are available for numerically solving
ordinary differential equations. All give reliable and accurate results for a broad range of
systems, at least for solutions over moderately long time periods. However, all of these
packages and the underlying methods have their limitations, and it is essential that one
be able to recognize when the methods are working as advertised, and when they are
giving spurious results!
In this section, we concentrate on numerical methods for initial value problems.
(Boundary value problems are dealt with in Sections 10.6 and cvfe.) We shall introduce a few of the most basic methods, culminating in the very popular Runge-Kutta
fourth order method. This will only serve as an introduction to the subject, and many
other useful methods can be found in more specialized texts, [node]. Some equations are
more difficult to accurately approximate than others, and a variety of more specialized
methods are employed when confronted with a more recalcitrant system. Of course, an
important feature is to decide whether a numerical solution is actually approximating the
true solution. Here is where the theory, particularly the classification of equilibria and
their stability properties, as well as first integrals and Lyapunov functions, can play an
essential role. Explicit solutions, when known, can also be used as test cases for tracking
the reliability and accuracy of a chosen numerical scheme.
Euler's Method
The key issues already appear when confronting the simplest first order scalar equation
du/dt = F(t, u),      u(t_0) = u_0.   (19.65)
To keep matters simple, we will concentrate on the scalar case. However, the methods
are all phrased in a manner that allows them to be readily adapted to first order systems:
just replace the scalar functions u(t) and F(t, u) by vectors u and F(t, u) throughout.
(The time t, of course, remains a scalar.) Higher order ordinary differential equations are
almost always handled by first converting them into an equivalent first order system, as
discussed in Section 19.1, and then applying the numerical methods thereunto.
We begin with the very simplest method for solving the initial value problem (19.65),
which is named after the prolific eighteenth century Swiss mathematician Leonhard Euler,
although Newton and his contemporaries were well aware of such a simple technique. Euler's method is rarely used in practice because much more accurate techniques can be implemented
with minimal additional work. Nevertheless, the method lies at the core of the entire subject, and must be thoroughly understood before progressing on to the more sophisticated
algorithms that are used in realistic computations.
Starting at the initial point t_0, we introduce mesh points
t_0 < t_1 < t_2 < t_3 < · · · ,
continuing on until we reach a desired final time t_n = t*. The mesh points t_k should be
fairly closely spaced. In our presentation, we will always adopt a uniform step size, and so
h = t_{k+1} - t_k > 0   (19.66)

does not depend on k and is assumed to be relatively small. This assumption serves to
simplify the analysis, and does not significantly affect the underlying ideas. For a uniform
step size, the k th mesh point is at tk = t0 + k h. More sophisticated adaptive methods,
in which the step size is adjusted in order to maintain accuracy of the numerical solution,
can be found in more advanced texts, e.g., [node].
A numerical algorithm will recursively compute approximations u_k ≈ u(t_k) to the sampled values of the solution at the chosen mesh points. The error in the approximation at time t_k will be denoted by
e_k = u(t_k) - u_k,   (19.67)

and the goal is to make each error as small as possible. If required, the values of the solution
u(t) between mesh points may be computed by a subsequent interpolation procedure, e.g.,
based upon cubic splines.
Euler's method begins with the standard first order Taylor approximation to the
solution. Thus, we approximate u(t) near the mesh point t_k by its tangent line
u(t) ≈ u(t_k) + (t - t_k) u'(t_k) = u(t_k) + (t - t_k) F(t_k, u(t_k)),
where we replace the derivative du/dt by the right hand side of the governing differential
equation (19.65). In particular, the approximate value of the solution at the subsequent
mesh point is
u(t_{k+1}) ≈ u(t_k) + (t_{k+1} - t_k) F(t_k, u(t_k)).   (19.68)
This simple idea forms the basis of Euler's method.
Since in practice we only know the approximation u_k to the value of u(t_k) at the
current mesh point, we are forced to replace u(t_k) by its approximation u_k, and thereby
convert (19.68) into the iterative scheme
u_{k+1} = u_k + (t_{k+1} - t_k) F(t_k, u_k).   (19.69)
In particular, for a uniform step size (19.66), Euler's method takes the form
u_{k+1} = u_k + h F(t_k, u_k).   (19.70)
As sketched in Figure Euler , the method starts off approximating the solution reasonably
well, but gradually loses accuracy as the errors accumulate.
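For concreteness, here is a direct implementation of the scheme (19.70) (a sketch, not part of the
original text). The short test run at the bottom uses the problem of Example 19.34 below, du/dt = u,
u(0) = 1, and reproduces, up to rounding, the first column of its error table.

import math

def euler(F, t0, u0, h, n_steps):
    """Apply n_steps uniform Euler steps (19.70) of size h to du/dt = F(t, u), u(t0) = u0."""
    ts, us = [t0], [u0]
    t, u = t0, u0
    for _ in range(n_steps):
        u = u + h * F(t, u)               # the Euler update
        t = t + h
        ts.append(t)
        us.append(u)
    return ts, us

for h in (0.1, 0.01, 0.001):
    ts, us = euler(lambda t, u: u, 0.0, 1.0, h, round(1.0 / h))
    print(f"h = {h}:  error at t = 1 is {us[-1] - math.e:.6f}")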
To understand how Euler's method works in practice, we begin by looking at a problem
we know how to solve. As usual, the best way to test a numerical solution method is to
try it on a problem with a known solution, since then we can determine exactly how large
the resulting approximation error is.
Example 19.34. The simplest nontrivial ordinary differential equation is
du/dt = u,      u(0) = 1.
The solution to the initial value problem is, of course, the exponential function u(t) = e^t.
Since F(t, u) = u, Euler's method (19.70) with a fixed step size h > 0 takes the form
u_{k+1} = u_k + h u_k = (1 + h) u_k.
This linear iterative equation is easy to solve:
u_k = (1 + h)^k u_0 = (1 + h)^k,
which is our proposed approximation to the solution u(t_k) = e^{t_k} at the mesh point t_k = kh.
Therefore, by adopting the Euler scheme to solve the differential equation, we are effectively
approximating the exponential function
e^{t_k} = e^{kh} ≈ (1 + h)^k
by a power. When we replace the mesh time t_k = kh by t, we recover, in the limit, a
well-known formula:
e^t = lim_{h → 0} (1 + h)^{t/h} = lim_{k → ∞} (1 + t/k)^k.
The student familiar with the theory of compound interest, [int], will recognize this particular approximation. As the time interval of compounding h gets smaller and smaller,
the amount in the savings account approaches an exponential. Note particularly that the
smaller the step size, the larger the number of steps required to reach a given time. Thus,
for time t = 1 we need k = 10 steps of size h = .1, but k = 1000 steps of size h = .001.
How good is the resulting approximation? The error
e(t) = u_k - e^t      at time      t = t_k = kh,

measures the difference between the true solution and its numerical approximation. Let
us tabulate the error at the particular times t = 1, 2 and 3 for various values of the step
size h. The actual solution values are
e^1 = e = 2.718281828 . . . ,      e^2 = 7.389056096 . . . ,      e^3 = 20.085536912 . . . .

In this case, the approximate solution always underestimates the true solution.

h            e(1)           e(2)           e(3)
.1           -.125          -.662          -2.636
.01          -.0134         -.0730         -.297
.001         -.00135        -.00738        -.0301
.0001        -.000136       -.000739       -.00301
.00001       -.0000136      -.0000739      -.000301

Some key observations:


(i ) The further t is away from the initial point t0 = 0, the larger the magnitude of the
error for a given step size.
(ii ) On the other hand, the smaller the step size, the smaller the error. The trade-off is
that more computational effort is required to produce the numerical approximation.
(iii ) The error is more or less in proportion to the step size. Decreasing the step size by
a factor of 1/10 decreases the error by a similar amount.
The final observation is indicative of the fact that the Euler method is of first order, which
means that the error depends linearly on the step size h. More specifically, at a fixed
time t, the error is bounded by
| e(t_k) | = | u_k - u(t_k) | ≤ C(t_k) h,   (19.71)
for some constant C(t) > 0 that depends upon the time and the particular solution, but
not on the step size. A more detailed discussion will appear shortly.

In this case, there is an explicit formula for the numerical solution. However, in almost any
other situation, one cannot compute the approximation uk without having first determined the
intermediate values u_0, . . . , u_{k-1}.

See our earlier discussion of the order of iterative methods for motivation.

Example 19.35. The solution to the initial value problem
du/dt = (1 - 2t) u,      u(0) = 1,   (19.72)
was found in Example 19.3 by the method of separation of variables:
u(t) = exp(t - t²).   (19.73)
Euler's method uses the scheme
u_{k+1} = u_k + h (1 - 2t_k) u_k,      u_0 = 1,   (19.74)
to approximate the solution. The following table lists the errors e(t_k) = u_k - u(t_k) between
the values computed by the Euler scheme and the actual solution values
u(1) = 1.00000000,      u(2) = 0.13533528,      u(3) = 0.00247875,   (19.75)
for several step sizes.


h            e(1)             e(2)             e(3)
0.1000       0.08684799       0.00649610       0.00159986
0.0100       0.00836810       0.00046864       0.00018384
0.0010       0.00083368       0.00004528       0.00001857
0.0001       0.00008334       0.00000451       0.00000186

As in the previous example, each decrease in step size by a factor of 10 leads to one
additional decimal digit of accuracy in the computed solution. The down side is that the
amount of computation has correspondingly increased by a factor of 10. In Figure ox2e
we compare the graphs of the actual and numerical solutions.
Taylor Methods
In general, the order of a numerical solution method governs both the accuracy of
its approximations and the speed at which they converge to the true solution. Although
the Euler method is simple and easy to implement, it is only a first order method, and
therefore of rather limited utility for efficiently computing accurate approximations. Thus,
there is a great need to devise reasonably simple but high order methods to accurately and
rapidly approximate the solutions to ordinary differential equations.
Our original derivation of the Euler method was based on a first order Taylor approximation to the solution. An evident way to design a higher order method is to employ a
higher order Taylor approximation. The Taylor series expansion for the solution u(t) at
the succeeding mesh point tk+1 = tk + h has the form
u(t_{k+1}) = u(t_k + h) = u(t_k) + h du/dt(t_k) + (h²/2) d²u/dt²(t_k) + · · · .   (19.76)
As we just saw, we can evaluate the first derivative term du/dt = F(t, u) using the differential
equation. The second derivative term can be found by differentiating with respect to t.
Invoking the chain rule,
d²u/dt² = d/dt (du/dt) = d/dt F(t, u(t)) = ∂F/∂t(t, u) + ∂F/∂u(t, u) du/dt
        = ∂F/∂t(t, u) + ∂F/∂u(t, u) F(t, u) ≡ F^{(2)}(t, u).   (19.77)
Substituting the resulting formula into (19.76) leads to the second order Taylor method
u_{k+1} = u_k + h F(t_k, u_k) + (h²/2) F^{(2)}(t_k, u_k)
        = u_k + h F(t_k, u_k) + (h²/2) [ ∂F/∂t(t_k, u_k) + ∂F/∂u(t_k, u_k) F(t_k, u_k) ],   (19.78)
in which, as before, we replace the solution value u(tk ) by the known approximation uk .
The resulting method is of second order, meaning that the error function satisfies the
quadratic estimate
| e(t_k) | = | u_k - u(t_k) | ≤ C(t_k) h²,   (19.79)
which is proportional to the square of the step size.
Example 19.36. Let us explicitly formulate the second order Taylor method for the
initial value problem (19.17). Here
du/dt = F(t, u) = (1 - 2t) u,
and so (19.78) becomes
u_{k+1} = u_k + h (1 - 2t_k) u_k + (h²/2) [ -2u_k + (1 - 2t_k)² u_k ],      u_0 = 1.   (19.80)
A graph of the resulting numerical solution with h = .1, compared with the exact solution
(19.73) and its Euler approximation, appears in Figure ox2t . The following table lists the
errors between the values computed by the second order Taylor scheme and the actual
solution values, as given in (19.75).

h           e(1)               e(2)               e(3)
0.100       0.0001933681       0.0005535446       0.0002389282
0.010       0.0000001917       0.0000045963       0.0000017850
0.001       0.0000000002       0.0000000452       0.0000000174

Thus, in accordance with the second order error estimate (19.79), a decrease in the step
size by a factor of 1/10 leads to an increase in the accuracy of the solution by a factor of 1/100, i.e.,
an increase in 2 significant decimal places in the numerical approximation of the solution.


(For some unexplained reason, the errors at t = 1 become smaller even faster.)
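For completeness, here is a direct implementation of the scheme (19.80) (a sketch, not part of the
original text); running it produces errors of the same magnitudes as those in the table above.

import math

def taylor2_step(t, u, h):
    # one step of (19.80) for F(t, u) = (1 - 2t) u
    F  = (1 - 2*t) * u
    F2 = -2*u + (1 - 2*t)**2 * u          # F^(2)(t, u) from (19.77)
    return u + h*F + 0.5 * h**2 * F2

for h in (0.1, 0.01, 0.001):
    t, u = 0.0, 1.0
    errors = {}
    for k in range(round(3.0 / h)):
        u = taylor2_step(t, u, h)
        t = (k + 1) * h
        if abs(t - round(t)) < 1e-12:     # record the error at t = 1, 2, 3
            errors[round(t)] = u - math.exp(t - t*t)
    print(h, {T: f"{e:.10f}" for T, e in errors.items()})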
Higher order methods can be obtained by including additional terms in the expansion
(19.76). For example, to derive a third order Taylor method, we include the third order
term where we evaluate the third derivative by differentiating (19.77), and so
d³u/dt³ = d/dt (d²u/dt²) = d/dt F^{(2)}(t, u(t)) = ∂F^{(2)}/∂t + ∂F^{(2)}/∂u du/dt = ∂F^{(2)}/∂t + F ∂F^{(2)}/∂u
        = ∂²F/∂t² + 2F ∂²F/∂t∂u + F² ∂²F/∂u² + ∂F/∂t ∂F/∂u + F (∂F/∂u)² ≡ F^{(3)}(t, u).   (19.81)
The resulting third order Taylor method is
u_{k+1} = u_k + h F(t_k, u_k) + (h²/2) F^{(2)}(t_k, u_k) + (h³/6) F^{(3)}(t_k, u_k),   (19.82)
where the last two summands are given by (19.77), (19.81), respectively. The higher order
expressions are even worse, and a good symbolic manipulation system is almost essential.
(Although, in the past, mathematicians were able to perform these sorts of computations
by hand!)
Although higher order Taylor methods are easy to motivate, they are rarely used in
practice. There are two principal difficulties:
(a) Owing to their dependence upon the partial derivatives of F (t, u), they require the
right hand side of the differential equation to be rather smooth.
(b) Even worse, the explicit formulae become exceedingly complicated, even for relatively
simple functions F (t, u). Therefore, fast evaluation of the terms in the Taylor
approximation becomes a significant issue.
We will abandon the Taylor series approach, and look elsewhere for high order, efficient
integration methods.

Local Truncation Error


Before continuing our investigations, we need to engage in a more serious discussion of
the error in a numerical scheme. The Euler method is the simplest example of a one-step
numerical method for integrating an ordinary differential equation, which refers to the fact
that the value for the succeeding approximation, uk+1 u(tk+1 ), depends only upon the
current value, uk u(tk ), which is one mesh point (step) in back. A general one-step
numerical method can be written in the form
uk+1 = G(h, tk , uk ),

(19.83)

where G is a prescribed function of the current value uk , the point tk itself, and the step
size h = t_{k+1} - t_k, which, for illustrative purposes, we assume to be fixed. We leave the
discussion of multi-step methods, in which G could also depend upon the earlier values
uk1 , uk2 , . . . , to more advanced texts, e.g., [node].
In order to understand how closely such a numerical method tracks the solution, we
need to analyze the error in the approximation. In any numerical integration scheme there
are, in general, three sources of error.
(i ) The first is the local truncation error committed in the current step of the algorithm.
Even if we had managed to compute a completely accurate value of the solution
u_k = u(t_k) at time t_k, the numerical approximation scheme (19.83) is not exact,
and will therefore introduce an error into the next computed value u_{k+1} ≈ u(t_{k+1}).
(ii ) The second source of error is due to the error that is already present in the current
approximation u_k ≈ u(t_k). The local errors tend to accumulate as we continue to
integrate the differential equation, and the net result is the global truncation error
in the scheme.
(iii ) Finally, if the initial condition u_0 ≈ u(t_0) is not computed accurately, this initial
error will also make a contribution. For example, if u(t_0) = π, then we introduce
some initial error by using a decimal approximation, say π ≈ 3.14.
The third error source is relatively unimportant, and will be ignored in our discussion,
i.e., we will assume u0 = u(t0 ) is exact. Then the global error will be the accumulation of
successive local errors, and so we must first understand the local error in detail in order
to ascertain the accuracy of a proposed numerical scheme.
More concretely, given a numerical scheme (19.83), the local truncation error is, by
definition, the difference
u_{k+1} - u(t_{k+1}) = G(h, t_k, u(t_k)) - u(t_k + h),   (19.84)
where u(t) is the exact solution to the differential equation with initial condition u(t_k) = u_k.
As advertised, the local truncation error assumes that we have computed the solution
accurately at time step t_k, and measures only the error introduced at t_{k+1}. Thus, in the
case of Euler's method (19.70), we have
u_{k+1} = G(h, t_k, u_k) = u_k + h F(t_k, u_k),
and hence its local truncation error is given by
u_k + h F(t_k, u_k) - u(t_k + h),      provided      u(t_k) = u_k      and      du/dt = F(t, u).   (19.85)

To estimate the local truncation error, we assume that the step size h is small and
approximate the solution u(t) by its Taylor expansion
du
h 2 d2 u
h 3 d3 u
(tk ) +
(t
)
+
(t ) +
dt
2 dt2 k
6 dt3 k
h2 (2)
h3 (3)
= uk + h F (tk , uk ) +
F (tk , uk ) +
F (tk , uk ) + ,
2
6

u(tk+1 ) = u(tk ) + h

(19.86)

where we replace u(tk ) by uk , and use (19.77), (19.81), etc., to evaluate the derivative
terms. On the other hand, a direct Taylor expansion, in h, of the numerical scheme
produces
uk+1 = G(h, tk , uk ) = G(0, tk , uk ) + h
3/7/03

G
h2 2 G
(0, tk , uk ) +
(0, tk , uk ) + . (19.87)
h
2 h2
869

c 2003

Peter J. Olver

Thus, for the Euler method (19.70), uk+1 = uk + h F (tk , uk ) agrees with the zeroth and
first order terms in the Taylor expansion, and hence its local truncation error equals
u_k + h F(t_k, u_k) - u(t_k + h) = -(h²/2) F^{(2)}(t_k, u_k) - (h³/6) F^{(3)}(t_k, u_k) - · · · .   (19.88)

Definition 19.37. A numerical integration method is of order n if the local truncation error is bounded by a multiple of h^{n+1}, so that
| G(h, t_k, u(t_k)) - u(t_k + h) | ≤ h^{n+1} M(h, t_k, u_k)   (19.89)
for some bounded function M(h, t_k, u_k) and all h sufficiently small.


In practice, one does not need to find the explicit bound (19.89). The practical rule
of thumb is that the order of the method is one less than the power of h appearing in the
first nonzero term of the Taylor expansion:
u(t_k + h) - G(h, t_k, u(t_k)) = h^{n+1} H(t_k, u_k) + · · · .   (19.90)
Thus, in the case of the Euler method, the first term in the Taylor expansion of the local truncation error (19.88) is of order h², and hence the Euler method is first order. Similarly,
the Taylor method (19.78) is a second order method, because it was explicitly designed to
match the constant, h and h² terms in the Taylor expansion of the solution (19.86). Thus,
the first terms where the expansions (19.86), (19.87) disagree involve h³, and so the method
is of second order. For a general Taylor method of order n, one chooses G(h, t_k, u_k) to be
exactly the order n Taylor polynomial, and hence the local truncation error is bounded by
a multiple of h^{n+1}.
Under very general hypotheses, it can be proved that if the local truncation error has
order h^{n+1}, as in (19.89), then the global error is bounded by a multiple of h^n. In other
words, if the initial condition u_0 = u(t_0) is accurate, then the computed value u_k differs
from the solution at time t_k by an amount
| u_k - u(t_k) | ≤ h^n Q(h, t_k, u_k),   (19.91)
where Q(h, t_k, u_k) is bounded. Thus, a numerical scheme of order n has local error bounded
by a multiple of h^{n+1}, and global error bounded by a multiple of h^n, which explains the
choice of terminology. In particular, if the step size decreases by a factor of 1/10, then the
error in the solution decreases by a factor of at least 10^{-n}, and so, roughly, we expect to
pick up an additional n digits of accuracy in the solution value, at least up until the
point that round-off errors begin to play a role in the computation. These observations are
very rough, and need to be taken with a grain of salt; nevertheless, they are borne out in
almost all of our test examples. Readers interested in a complete error analysis of numerical
integration schemes should consult a more specialized text, e.g., [numODE, node].
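A simple way to observe these orders empirically (a sketch, not from the text) is to run a scheme at
step sizes h and h/2 on a problem with a known solution and examine the ratio of the global errors;
for a method of order n, the ratio should approach 2^n. For Euler's method the ratio tends to 2:

import math

def euler_error_at_1(h):
    """Global error at t = 1 of Euler's method applied to du/dt = u, u(0) = 1."""
    u = 1.0
    for k in range(round(1.0 / h)):
        u += h * u
    return abs(u - math.e)

h = 0.1
for _ in range(6):
    ratio = euler_error_at_1(h) / euler_error_at_1(h / 2)
    print(f"h = {h:.5f}   error ratio e(h)/e(h/2) = {ratio:.3f}")
    h /= 2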
The bottom line is that the higher its order, the more accurate the numerical scheme,
and hence the larger the step size that can be used to produce the solution to a desired
accuracy. If the total amount of computation has also decreased, then the method is to be
preferred over a simpler, lower order method. Our goal now is to find another route to the
design of higher order methods that avoids the complications inherent in a direct Taylor
expansion. More specifically, we are in need of suitably compact combinations of function


values that reproduce the Taylor expansion of the solution (19.86) to high order.
An Equivalent Integral Equation
The secret underlying the design of practical higher order numerical methods is to
replace the differential equation by an equivalent integral equation. By way of motivation,
recall that, in general, differentiation is a badly behaved process; a reasonable function
can have an unreasonable derivative. On the other hand, integration ameliorates; even
quite nasty functions have relatively well-behaved integrals. For the same reason, accurate
numerical integration is relatively easy, whereas numerical differentiation should be avoided
if possible. While we have not dealt directly with integral equations in this text, the subject
has been extensively developed, [31], and has many important applications.
The conversion of the initial value problem (19.65) to an integral equation is straightforward. We integrate both sides of the differential equation from the initial point t_0 to a variable time t. The Fundamental Theorem of Calculus is used to explicitly evaluate the integral:

    u(t) - u(t_0) = \int_{t_0}^{t} \dot{u}(s)\, ds = \int_{t_0}^{t} F(s, u(s))\, ds.

Rearranging terms, we arrive at the key result.


Theorem 19.38. There is a one-to-one correspondence between solutions to the initial value problem

    \frac{du}{dt} = F(t, u),        u(t_0) = u_0,

and solutions to the integral equation

    u(t) = u(t_0) + \int_{t_0}^{t} F(s, u(s))\, ds.        (19.92)

Proof : We already showed that the solution u(t) to the initial value problem satisfies
the integral equation (19.92). Conversely, if u(t) solves the integral equation, then the
Fundamental Theorem of Calculus tells us that u(t) is differentiable, and has derivative
du/dt = F (t, u(t)) equal to the integrand. Moreover, at t = t0 , the integral has the same
upper and lower limits, and so vanishes. This implies that u(t_0) = u_0, so that u(t) has the correct initial conditions.
Q.E.D.
Remark: Unlike the differential equation, the integral equation (19.92) requires no additional initial condition; it is automatically built into the equation. The proofs of the
fundamental existence and uniqueness Theorems 19.8 and 19.10 for ordinary differential
equations are, in fact, based on the integral reformulation of the initial value problem; see
[59, 65] for details.
Implicit and Predictor-Corrector Methods
From this point onwards, we shall abandon the original initial value problem, and
concentrate on trying to numerically solve the integral equation (19.92). Let us rewrite
the equation, starting at the mesh point t_k instead of t_0, and integrating until time t = t_{k+1}. The result is the basic integral formula

    u(t_{k+1}) = u(t_k) + \int_{t_k}^{t_{k+1}} F(s, u(s))\, ds        (19.93)

that (implicitly) computes the value of the solution at the subsequent mesh point. Comparing this formula with the Euler method

    u_{k+1} = u_k + h\, F(t_k, u_k),        where        h = t_{k+1} - t_k,

and assuming for the moment that u_k = u(t_k) is exact, we discover that we are merely approximating the integral by

    \int_{t_k}^{t_{k+1}} F(s, u(s))\, ds \approx h\, F(t_k, u(t_k)).        (19.94)

Formula (19.94) is the left-endpoint rule for numerical integration, which approximates the area under the curve g(t) = F(t, u(t)) for t_k \le t \le t_{k+1} by the area of a rectangle whose height g(t_k) = F(t_k, u(t_k)) \approx F(t_k, u_k) is prescribed by the left-hand endpoint of the graph. As indicated in Figure lhi , this is a reasonable, but not especially accurate, method of numerical integration.
In first year calculus, you no doubt encountered much better methods of approximating
the integral of a function. One of these is the trapezoid rule, which approximates the
integral of the function g(t) by the area of a trapezoid obtained by connecting the two
endpoints g(tk ) and g(tk+1 ) by a straight line, as in Figure trap . Let us therefore try
replacing (19.94) by the more accurate trapezoidal approximation
    \int_{t_k}^{t_{k+1}} F(s, u(s))\, ds \approx \tfrac{1}{2}\, h \bigl[ F(t_k, u(t_k)) + F(t_{k+1}, u(t_{k+1})) \bigr].        (19.95)

Substituting this approximation into the integral formula (19.93), and replacing u(t_k), u(t_{k+1}) by their numerical approximations, leads to the (hopefully) more accurate numerical scheme

    u_{k+1} = u_k + \tfrac{1}{2}\, h \bigl[ F(t_k, u_k) + F(t_{k+1}, u_{k+1}) \bigr].        (19.96)

The resulting trapezoid method is an implicit scheme, since the updated value u_{k+1} appears on both sides of the equation, and hence is defined implicitly.

Example 19.39. Consider the differential equation \dot{u} = (1 - 2 t)\, u studied in Examples 19.35 and 19.36. The trapezoid rule with a fixed step size h = t_{k+1} - t_k takes the form

    u_{k+1} = u_k + \frac{h}{2} \bigl[ (1 - 2 t_k)\, u_k + (1 - 2 t_{k+1})\, u_{k+1} \bigr].

Solving for the updated value, we obtain

    u_{k+1} = \frac{1 + \frac{1}{2} h (1 - 2 t_k)}{1 - \frac{1}{2} h (1 - 2 t_{k+1})}\, u_k = \frac{1 + \frac{1}{2} h - h t_k}{1 - \frac{1}{2} h + h (t_k + h)}\, u_k.        (19.97)

Implementing this scheme for three different step sizes gives the following errors between
the computed solution and true solution at times t = 1, 2, 3.
    h        e(1)            e(2)            e(3)

    0.100    0.0000000000    0.0002279134    0.0000871062
    0.010    0.0000000000    0.0000022558    0.0000008676
    0.001    0.0000000000    0.0000000226    0.0000000087

Surprisingly, at t = 1 the error is less than round-off error! Note the gain of two significant
figures with each reduction in step size, confirming that this is a second order method.
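For this linear equation the implicit update can be carried out explicitly via (19.97), and the following Python sketch does so; the loop structure, error bookkeeping, and printing format are our own illustrative choices.

    import math

    def trapezoid_linear(h, T=3.0):
        # trapezoid scheme for  u' = (1 - 2t) u,  u(0) = 1,  using the
        # closed-form update (19.97), available because the equation is linear
        t, u, errors = 0.0, 1.0, {}
        for _ in range(round(T / h)):
            u *= (1.0 + 0.5 * h * (1.0 - 2.0 * t)) / (1.0 - 0.5 * h * (1.0 - 2.0 * (t + h)))
            t += h
            if abs(t - round(t)) < 1e-9:
                # record the error against the exact solution  u(t) = exp(t - t^2)
                errors[round(t)] = abs(u - math.exp(t - t * t))
        return errors

    for h in (0.1, 0.01, 0.001):
        errs = trapezoid_linear(h)
        print(h, {k: f"{v:.10f}" for k, v in sorted(errs.items())})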
The main problem with the trapezoid scheme (and any other implicit scheme) is immediately apparent. The updated approximate value for the solution u_{k+1} appears on both sides of the equation (19.96). Only for very simple functions F(t, u) can one expect to solve (19.96) explicitly for u_{k+1} in terms of the known quantities t_k, u_k and t_{k+1} = t_k + h. The alternative is to employ a numerical equation solver, such as the bisection algorithm or Newton's method, to compute u_{k+1}. In the case of Newton's method, one would use the current approximation u_k as a first guess for the new approximation u_{k+1}, similar to the continuation method discussed in Example 18.27. The resulting scheme takes a little bit of work to program, but can be effective in certain situations.
An alternative, less complicated strategy is the following. We already know a halfway decent approximation to the solution value u_{k+1}, namely that provided by the more primitive Euler scheme

    \tilde{u}_{k+1} = u_k + h\, F(t_k, u_k).        (19.98)

Let's use this estimated value in place of u_{k+1} on the right hand side of the implicit equation (19.96). The result is the Heun or improved Euler method

    u_{k+1} = u_k + \tfrac{1}{2}\, h \bigl[ F(t_k, u_k) + F(t_k + h, \tilde{u}_{k+1}) \bigr]
            = u_k + \tfrac{1}{2}\, h \bigl[ F(t_k, u_k) + F\bigl( t_k + h,\; u_k + h\, F(t_k, u_k) \bigr) \bigr],        (19.99)

which is a completely explicit method.

Example 19.40. For our favorite equation \dot{u} = (1 - 2 t)\, u, the Heun method takes the form

    \tilde{u}_{k+1} = u_k + h\, (1 - 2 t_k)\, u_k,

    u_{k+1} = u_k + \frac{h}{2} \bigl[ (1 - 2 t_k)\, u_k + (1 - 2 t_{k+1})\, \tilde{u}_{k+1} \bigr]
            = u_k + \frac{h}{2} \bigl[ (1 - 2 t_k)\, u_k + \bigl( 1 - 2 (t_k + h) \bigr) \bigl( u_k + h\, (1 - 2 t_k)\, u_k \bigr) \bigr].        (19.100)

Implementing this scheme leads to the following errors at the indicated times. As in the implicit method (19.97), the accuracy increases by, roughly, a factor of 1/100 for each reduction in step size by 1/10. As might have been predicted, the Heun method performs a little worse than the fully implicit scheme, but much better than the original Euler method.
    h        e(1)            e(2)            e(3)

    0.100    0.0004765492    0.0025756696    0.0004679764
    0.010    0.0000004750    0.0000228375    0.0000033192
    0.001    0.0000000005    0.0000002258    0.0000000323

The Heun method is the simplest of a large family of predictor-corrector methods. One begins with a relatively crude method, in this case the Euler method, to predict a first approximation \tilde{u}_{k+1} to the desired solution value u_{k+1}. One then employs a more sophisticated, typically implicit, method to correct the original prediction, replacing the required update u_{k+1} on the right hand side of the implicit scheme by the less accurate prediction \tilde{u}_{k+1}. The resulting explicit, corrected value u_{k+1} will, provided the method has been properly designed, result in a much better approximation to the true solution.
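To make the predict-then-correct pattern concrete, here is a minimal Python sketch of the Heun scheme (19.99) for a general right hand side F, applied to the running example; the helper names and test data are our own, not from the text.

    def heun_step(F, t, u, h):
        # predictor: a rough Euler estimate of u(t + h)
        u_pred = u + h * F(t, u)
        # corrector: trapezoid rule with the prediction standing in for u_{k+1}
        return u + 0.5 * h * (F(t, u) + F(t + h, u_pred))

    def integrate(step, F, t0, u0, T, h):
        t, u = t0, u0
        while t < T - 1e-12:
            u = step(F, t, u, h)
            t += h
        return u

    F = lambda t, u: (1.0 - 2.0 * t) * u      # running example  u' = (1 - 2t) u
    print(integrate(heun_step, F, 0.0, 1.0, 1.0, 0.1))   # compare with exp(1 - 1) = 1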
To determine the order of accuracy for the Heun method, we compute its local truncation error. To this end, we expand the right hand side of (19.99) in a Taylor series in h, and then compare, term by term, with the solution expansion (19.86). First,

    F\bigl( t_k + h,\; u_k + h\, F(t_k, u_k) \bigr) = F + h \bigl( F_t + F\, F_u \bigr) + \tfrac{1}{2} h^2 \bigl( F_{tt} + 2\, F\, F_{tu} + F^2 F_{uu} \bigr) + \cdots,

where all the terms on the right hand side are evaluated at t_k, u_k. Substituting into (19.99), we find

    u_{k+1} = u_k + h\, F + \tfrac{1}{2} h^2 \bigl( F_t + F\, F_u \bigr) + \tfrac{1}{4} h^3 \bigl( F_{tt} + 2\, F\, F_{tu} + F^2 F_{uu} \bigr) + \cdots.        (19.101)

The Taylor expansions (19.86), (19.101) agree in their order 1, h and h^2 terms, but differ at order h^3. Thus, the local truncation error has order h^3, and we conclude that the Heun method is of second order, reconfirming our observations in Example 19.40.
Let us now return to the basic integral equation (19.93). The midpoint rule is an
alternative for approximating the integral, with the same order of accuracy as the trapezoid
rule. Here, to approximate the integral of the function g(t) = F(t, u(t)) over the interval [t_k, t_{k+1}], one uses the area of the rectangle whose height is the value of the function g\bigl(\tfrac{1}{2}(t_k + t_{k+1})\bigr) = g\bigl(t_k + \tfrac{1}{2} h\bigr) at the midpoint, as illustrated in Figure mid . This leads to the approximation

    \int_{t_k}^{t_{k+1}} F(s, u(s))\, ds \approx h\, F\bigl( t_k + \tfrac{1}{2} h,\; u(t_k + \tfrac{1}{2} h) \bigr).        (19.102)

In order to employ this approximation, we must predict the value of the solution at the midpoint t_k + \tfrac{1}{2} h. As before, this will be done through a straightforward adaptation of the basic Euler approximation:

    u\bigl( t_k + \tfrac{1}{2} h \bigr) \approx u_k + \tfrac{1}{2} h\, F(t_k, u_k).        (19.103)
The result is the midpoint method

    u_{k+1} = u_k + h\, F\bigl( t_k + \tfrac{1}{2} h,\; u_k + \tfrac{1}{2} h\, F(t_k, u_k) \bigr).        (19.104)

A similar Taylor expansion in h and comparison of terms reveals that the midpoint method
also has local truncation error of order h^3, and hence defines a second order method.
Example 19.41. For our usual initial value problem (19.17), the midpoint rule takes the form

    u_{k+1} = u_k + h \bigl( 1 - 2 (t_k + \tfrac{1}{2} h) \bigr) \bigl( u_k + \tfrac{1}{2} h\, (1 - 2 t_k)\, u_k \bigr).        (19.105)

Implementation results in the following errors at the indicated times. Again, these confirm that the method is second order, and the results are marginally better than the Heun method.
    h        e(1)            e(2)            e(3)

    0.100    0.0001412334    0.0010032111    0.0003512828
    0.010    0.0000001417    0.0000091199    0.0000025520
    0.001    0.0000000001    0.0000000903    0.0000000249

Runge-Kutta Methods
The improved Euler and midpoint methods are the most elementary incarnations of a general class of numerical methods that were first systematically studied by the German mathematicians Carle Runge and Martin Kutta in the nineteenth century. The Runge-Kutta methods are by far the most popular and powerful general-purpose numerical methods for integrating ordinary differential equations. While not appropriate in all possible situations, Runge-Kutta schemes are surprisingly adaptable and perform quite efficiently and accurately in a wide variety of settings. Most computer software that is designed to solve general initial value problems for systems of ordinary differential equations is based on a Runge-Kutta scheme.
A general Runge-Kutta method takes the form

    u_{k+1} = u_k + h \sum_{i=1}^{m} c_i\, F(t_{i,k}, u_{i,k}).        (19.106)

We call m the number of terms in the method. Each t_{i,k} = t_k + \lambda_i h denotes a point lying between the mesh points t_k and t_{k+1}, so 0 \le \lambda_i \le 1, while u_{i,k} \approx u(t_{i,k}) should be viewed as an approximation to the solution at t_{i,k}, and is computed by a similar, but simpler method of the same general form. The number of required function values F(t_j, u_j), including those used to compute the intermediate points u_{i,k}, is known as the number of steps in the Runge-Kutta scheme (19.106). The more steps, the higher the anticipated order of the method. One is free to choose the coefficients c_i, the times t_{i,k} and the intermediate approximations u_{i,k}, and the goal is to arrange that the method have a desired order of accuracy, while not becoming unduly complicated.
Both the Heun and midpoint methods are particular cases of a class of two term Runge-Kutta methods of the form

    u_{k+1} = u_k + h \bigl[ a\, F(t_k, u_k) + b\, F\bigl( t_k + \lambda h,\; u_k + \lambda h\, F(t_k, u_k) \bigr) \bigr],        0 \le \lambda \le 1,        (19.107)

where t_{k,1} = t_k and u_{k,1} = u_k are the current values, while u_{k,2} = u_k + \lambda h\, F(t_k, u_k) represents the Euler approximation to the solution at the intermediate time t_{k,2} = t_k + \lambda h. The values of a, b and \lambda are to be determined by matching the Taylor expansion

    u_{k+1} = u_k + h \bigl[ a\, F(t_k, u_k) + b\, F\bigl( t_k + \lambda h,\; u_k + \lambda h\, F(t_k, u_k) \bigr) \bigr]
            = u_k + h\, (a + b)\, F(t_k, u_k) + h^2\, b\, \lambda \Bigl[ \frac{\partial F}{\partial t}(t_k, u_k) + F(t_k, u_k)\, \frac{\partial F}{\partial u}(t_k, u_k) \Bigr] + \cdots

(in powers of h) of the right hand side of (19.107) with the Taylor expansion (19.86) of the actual solution u(t_{k+1}) = u(t_k + h) to as high an order as possible. For the order h and order h^2 terms to agree, we must have, respectively,

    a + b = 1,        b\, \lambda = \tfrac{1}{2}.

Therefore, setting a = 1 - b, and \lambda = (2 b)^{-1}, where b \ne 0 is arbitrary, leads to the family of two term, second order Runge-Kutta methods of the form

    u_{k+1} = u_k + h \Bigl[ (1 - b)\, F(t_k, u_k) + b\, F\Bigl( t_k + \frac{h}{2 b},\; u_k + \frac{h}{2 b}\, F(t_k, u_k) \Bigr) \Bigr].        (19.108)

The case b = \tfrac{1}{2} corresponds to the Heun method (19.99), while b = 1 gives the midpoint method (19.104). Unfortunately, none of these methods are able to match all the third order terms in the Taylor expansion, and so we are left with a one-parameter family of two step Runge-Kutta methods, all of second order, that include the Heun and midpoint rules as particular instances. The cases when \tfrac{1}{2} \le b \le 1 all perform more or less comparably, and there is no special reason to prefer one over the other.
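The following Python sketch implements the one-parameter family (19.108), with the symbol lambda reconstructed as above; the particular values of b, the step size, and the test problem are our own illustrative choices.

    def rk2_step(F, t, u, h, b):
        # one step of the second order family (19.108):  a = 1 - b,  lambda = 1/(2b)
        lam = 1.0 / (2.0 * b)
        return u + h * ((1.0 - b) * F(t, u)
                        + b * F(t + lam * h, u + lam * h * F(t, u)))

    F = lambda t, u: (1.0 - 2.0 * t) * u          # running example, exact u(1) = 1
    for b in (0.5, 0.75, 1.0):                    # b = 1/2: Heun,  b = 1: midpoint
        t, u, h = 0.0, 1.0, 0.01
        for _ in range(100):                      # integrate up to t = 1
            u = rk2_step(F, t, u, h, b)
            t += h
        print(f"b = {b:4.2f}   u(1) approx = {u:.10f}   error = {abs(u - 1.0):.2e}")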
To construct a third order Runge-Kutta method, we need to take at least three terms, m \ge 3. A rather complicated symbolic computation will produce the range of possibilities, and the results can be found in [nODE]. Finding relatively simple, but high order Runge-Kutta methods is a rather tedious process, and we leave a complete discussion of the available options to a more advanced treatment. In practical applications, a particularly simple fourth order, four term method has become the most popular. The method, often abbreviated as RK4, takes the form

    u_{k+1} = u_k + \frac{h}{6} \bigl[ F_k + 2\, G_k + 2\, H_k + I_k \bigr],        (19.109)

where the function values F_k, G_k, H_k, I_k are successively computed according to the following procedure:

    F_k = F(t_k, u_k),        G_k = F\bigl( t_k + \tfrac{1}{2} h,\; u_k + \tfrac{1}{2} h\, F_k \bigr),
    H_k = F\bigl( t_k + \tfrac{1}{2} h,\; u_k + \tfrac{1}{2} h\, G_k \bigr),        I_k = F\bigl( t_k + h,\; u_k + h\, H_k \bigr).        (19.110)
Each of these quantities is formed from an Euler-like approximation: the first at t_k; the second and third at the midpoint t_k + \tfrac{1}{2} h, while the fourth is at the right hand end t_{k+1} = t_k + h. The final combination \tfrac{1}{6}(F_k + 2 G_k + 2 H_k + I_k) affords the best approximation of them all.
The four term RK4 scheme (19.109), (19.110) is, in fact, a fourth order method. This is confirmed by demonstrating that the Taylor series expansion of the right hand side of (19.109) in powers of h matches all of the terms in the true Taylor series (19.86) up to and including those of order h^4, and hence the local truncation error is of order h^5. This is not a computation for the faint-hearted! The RK4 scheme is a particular member of a large family of possible fourth order, four term Runge-Kutta methods, but is by far the most popular owing to its relative simplicity.
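A direct transcription of (19.109)-(19.110) into Python takes only a few lines; the following sketch (with our own function names and test data) applies it to the running example.

    import math

    def rk4_step(F, t, u, h):
        # the classical RK4 update (19.109)-(19.110)
        Fk = F(t, u)
        Gk = F(t + 0.5 * h, u + 0.5 * h * Fk)
        Hk = F(t + 0.5 * h, u + 0.5 * h * Gk)
        Ik = F(t + h, u + h * Hk)
        return u + (h / 6.0) * (Fk + 2.0 * Gk + 2.0 * Hk + Ik)

    F = lambda t, u: (1.0 - 2.0 * t) * u      # running example, exact u = exp(t - t^2)
    t, u, h = 0.0, 1.0, 0.1
    while t < 3.0 - 1e-12:
        u = rk4_step(F, t, u, h)
        t += h
    print(abs(u - math.exp(t - t * t)))       # error at t = 3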

Example 19.42. For our favorite equation \dot{u} = (1 - 2 t)\, u, the RK4 method leads to the following errors at the indicated times.
    h        e(1)                     e(2)                     e(3)

    0.100    1.287 \times 10^{-7}     8.112 \times 10^{-6}     3.818 \times 10^{-6}
    0.010    1.289 \times 10^{-12}    6.963 \times 10^{-10}    2.785 \times 10^{-10}
    0.001    1.110 \times 10^{-16}    3.872 \times 10^{-14}    2.536 \times 10^{-14}

The results are phenomenally good, and without significant additional computation over the previous schemes. Each decrease in the step size by a factor of 1/10 leads to 4 more decimal digits of accuracy, in accordance with it being a fourth order method.
Actually, since it involves four evaluations of the function F , we should be comparing
RK4 to Euler at step size h/4, or Heun at step size h/2, as these involve roughly the same
amount of computational effort. Again, RK4 gives a much more accurate representation
of the solution, which explains its popularity for a broad range of applications.
Example 19.43. As noted above, by replacing the function values u_k by vectors u_k, one can immediately apply the RK4 method to integrate initial value problems for first order systems of ordinary differential equations. Consider, for instance, a planar system such as the second order pendulum equation rewritten as a first order system, or the Lotka-Volterra predator-prey equations; a useful practical check on the accuracy of the computed solution is to monitor a known first integral of the system along the numerical trajectory, as in the sketch below.
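The following Python sketch applies the same RK4 step, now with vector-valued u, to the Lotka-Volterra system and tracks the drift in its classical first integral; the parameter values, initial data, and helper names are our own illustrative choices, not from the text.

    import numpy as np

    def rk4_step(F, t, u, h):
        # same RK4 update as before, now with u a numpy vector
        Fk = F(t, u)
        Gk = F(t + 0.5 * h, u + 0.5 * h * Fk)
        Hk = F(t + 0.5 * h, u + 0.5 * h * Gk)
        Ik = F(t + h, u + h * Hk)
        return u + (h / 6.0) * (Fk + 2.0 * Gk + 2.0 * Hk + Ik)

    # Lotka-Volterra system:  x' = x (a - b y),  y' = y (c x - d),
    # with first integral  V(x, y) = c x - d log x + b y - a log y.
    a, b, c, d = 1.0, 0.5, 0.2, 0.6

    def F(t, u):
        x, y = u
        return np.array([x * (a - b * y), y * (c * x - d)])

    def V(u):
        x, y = u
        return c * x - d * np.log(x) + b * y - a * np.log(y)

    u, t, h = np.array([2.0, 1.0]), 0.0, 0.01
    V0 = V(u)
    while t < 50.0 - 1e-9:
        u = rk4_step(F, t, u, h)
        t += h
    print("drift in the first integral after t = 50:", abs(V(u) - V0))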
1. For a surprise, try integrating our favorite initial value problem up to time t = 50
using a step size of h = .1. Can you explain what happens? If not, try this problem again
after reaching the end of the chapter.
In practical implementations, one needs to know when the numerical solution is no
longer accurate. If inaccuracies are detected at a certain stage in the computation, the
principal fix is to back up a bit, and reapply the method with a smaller step size. How
might one decide when a method is giving inaccurate results, since one presumably does not know the solution, and so has nothing to directly test the numerical approximation against? A useful idea is to integrate the differential equation using two different methods and compare the results. If they are reasonably close, then one is usually safe in assuming
that the methods are both giving accurate results, while in the event that they differ beyond
some preassigned tolerance, then one needs to re-evaluate the step size. Determination of
an appropriate new step size is facilitated by choosing the two methods to have different
orders of accuracy. Details can be found in more advanced treatments of the subject, e.g.,
[nODE].
Stiff Differential Equations
While the Runge-Kutta fourth order method with a sufficiently small step size will successfully integrate a broad range of differential equations, at least over not unduly long time intervals, it does occasionally experience unexpected difficulties. While we have not developed sufficiently sophisticated analytical tools to conduct a thorough analysis, it will help to look at why a breakdown might occur in a simpler context.
Example 19.44. To give the complacent reader a reality check, let us consider the simple linear, scalar initial value problem

    \frac{du}{dt} = -250\, u,        u(0) = 1.        (19.111)

The solution is easy:

    u(t) = e^{-250\, t},        with        u(1) \approx 2.69 \times 10^{-109}.

The following table gives the result of approximating the solution u(1) at t = 1 using three
of our numerical integration schemes for several step sizes:
    h        Euler                    Trapezoid                RK4

    .1       6.34 \times 10^{13}      3.99 \times 10^{24}      2.81 \times 10^{41}
    .01      4.07 \times 10^{17}      1.22 \times 10^{21}      1.53 \times 10^{-19}
    .001     1.15 \times 10^{-125}    6.17 \times 10^{-108}    2.69 \times 10^{-109}

The results are not misprints! When the step size is .1, not only are the values nowhere close, they are perplexingly large, and appear to represent an exponentially growing solution, the complete opposite of the rapidly decaying true solution. Reducing the step size beyond a critical threshold suddenly transforms the numerical solution to an exponentially decaying function. Only the fourth order RK4 method with step size h = .001, and hence 1,000 steps, does a reasonable job at approximating the correct value of the solution at t = 1.
The reader may well ask, what is going on? The solution couldn't be simpler, so why is it so difficult to compute it? To illustrate the basic issue, let us analyze how the simplest Euler method handles such differential equations. Our goal is to solve an elementary initial value problem of the form

    \frac{du}{dt} = \lambda\, u,        u(0) = 1,        (19.112)
with true solution u(t) = e^{\lambda t}. As in Example 19.34, for this equation, the Euler method with step size h relies on the iterative scheme

    u_{k+1} = (1 + \lambda h)\, u_k,        u_0 = 1,

with solution

    u_k = (1 + \lambda h)^k.        (19.113)

If \lambda > 0, the exact solution, e^{\lambda t}, is exponentially growing. Since 1 + \lambda h > 1, the numerical iterates are also growing, albeit at a somewhat slower rate. In this case, there is no inherent surprise with the numerical approximation procedure; in the short run it gives reasonably accurate results, but eventually lags behind the growth rate of the true solution. On the other hand, if \lambda < 0, then the exact solution e^{\lambda t} is exponentially decaying. But if \lambda h < -2, then 1 + \lambda h < -1, and the iterates (19.113) grow exponentially fast in magnitude, with alternating signs. In this case, the numerical solution is nowhere close to the true solution, which explains the previous pathological behavior. In order to correctly model the qualitative features of the solution and obtain a numerically respectable approximation, we need to choose the step size h so as to ensure that h < 1/| \lambda | when \lambda < 0. For the given value \lambda = -250, then, we need to choose h < 1/250 = .004. In consequence, the larger negative \lambda is, and hence the faster the solution tends to zero, the more difficult the numerical integration, owing to the extremely small required step size, even to the point of exhausting any available computing power. The solution methods for ordinary differential equations exhibit features similar to the conditional stability we encountered in our numerical solution to the heat and wave equations in Section 13.5.
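The Euler iterates (19.113) for this stiff example are easy to tabulate directly; the following short Python sketch (step sizes and print format are our own choices) should reproduce the qualitative behavior of the Euler column in the table above.

    def euler(lam, h, T=1.0):
        # Euler iterates for  u' = lam * u,  u(0) = 1:  u_k = (1 + lam*h)^k
        u = 1.0
        for _ in range(round(T / h)):
            u *= (1.0 + lam * h)
        return u

    lam = -250.0
    for h in (0.1, 0.01, 0.004, 0.001):
        print(f"h = {h:6.3f}   1 + lam*h = {1 + lam*h:7.2f}   u(1) approx = {euler(lam, h):.3e}")
    # Once |1 + lam*h| > 1 (here, for h > 2/250 = 0.008) the iterates blow up with
    # alternating signs; for h < 1/250 = 0.004 they decay monotonically.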
The ordinary differential equation (19.111) is the simplest example of what is known as a stiff differential equation. In general, an equation or system is stiff if it has one or more very rapidly decaying solutions. In the case of linear systems \dot{u} = A u, stiffness occurs whenever the coefficient matrix A has an eigenvalue with large negative real part: Re \lambda \ll 0. It only takes one such solution to render the equation stiff, and ruin the numerical computation of even the well behaved solutions! Curiously, the component of the actual solution corresponding to such large negative eigenvalues is almost irrelevant, as it becomes almost instantaneously negligible; however, the presence of such an eigenvalue continues to render the numerical solution to the system very difficult. Stiff equations require more sophisticated numerical procedures to integrate, and we refer the reader to [numODE, HairerWanner2] for details. In fact, some stiff equations are well-nigh impossible to solve!

Chapter 20
The Calculus of Variations
We have already had ample encounters with Nature's propensity to optimize. Minimization principles form one of the most powerful tools for formulating mathematical models governing the equilibrium configurations of physical systems. Moreover, the design of numerical integration schemes such as the powerful finite element method is also founded upon a minimization paradigm. This chapter is devoted to the mathematical analysis of minimization principles on infinite-dimensional function spaces, a subject known as the calculus of variations, for reasons that will be explained as soon as we present the basic ideas. Solutions to minimization problems in the calculus of variations lead to boundary value problems for ordinary and partial differential equations. Numerical solutions are primarily based upon a nonlinear version of the finite element method. The methods developed to handle such problems are of fundamental importance in many areas of mathematics, physics, and engineering.
The history of the calculus of variations is tightly interwoven with the history of calculus, and has merited the attention of a remarkable range of mathematicians, beginning with
Newton, then developed as a field of mathematics in its own right by the Bernoulli family.
The first major developments appeared in the work of Euler, Lagrange and Laplace. In
the nineteenth century, Hamilton, Dirichlet and Hilbert are but a few of the outstanding
contributors. In modern times, the calculus of variations has continued to occupy center
stage in research, including major theoretical advances, along with wide-ranging applications in physics, engineering and all branches of mathematics. In this chapter, we will only
have time to scratch the surface of the vast area of classical and contemporary research.
Minimization problems amenable to the methods of the calculus of variations serve to
characterize the equilibrium configurations of almost all continuous physical systems, ranging through elasticity, solid and fluid mechanics, electro-magnetism, gravitation, quantum
mechanics, and many, many others. Many geometrical systems, such as minimal surfaces,
can be conveniently formulated as optimization problems. Moreover, numerical approximations to the equilibrium solutions of such boundary value problems are based on a nonlinear finite element approach that reduces the infinite-dimensional minimization problem to a
finite-dimensional problem, to which we can apply the optimization techniques learned in
Section 18.3.
We have already treated the simplest problems in the calculus of variations. As we
learned in Chapters 10 and 14, minimization of a quadratic functional requires solving an
associated boundary value problem for a linear differential equation. Just as the vanishing
of the gradient of a function of several variables singles out the critical points, among which
are the minima, both local and global, so a similar functional gradient will distinguish
the candidate functions that might be minimizers of the functional. The finite-dimensional
gradient leads to a system of algebraic equations; the functional gradient leads to a boundary value problem for a nonlinear ordinary or partial differential equation. Thus, the
passage from finite to infinite dimensional nonlinear systems mirrors the transition from
linear algebraic systems to boundary value problems.

20.1. Examples of Variational Problems.


The best way to introduce the subject is to present some concrete examples of both
mathematical and practical importance. These particular minimization problems played
a key role in the historical development of the calculus of variations. And they still serve
as an excellent motivation for learning its basic constructions.
Minimal Curves and Geodesics
The minimal curve problem is to find the shortest path connecting two points. In its
simplest manifestation, we are given two distinct points
    a = (a, \alpha)        and        b = (b, \beta)        in the plane R^2.        (20.1)

Our goal is to find the curve of shortest length connecting them. Obviously, as you learn
in childhood, the shortest path between two points is a straight line; see Figure sline .
Mathematically, then, the minimizing curve we are after should be given as the graph of
the particular affine function
    y = c x + d = \frac{\beta - \alpha}{b - a}\, (x - a) + \alpha        (20.2)

passing through the two points. However, this commonly accepted fact that (20.2)
is the solution to the minimization problem is, upon closer inspection, perhaps not so
immediately obvious from a rigorous mathematical standpoint.
Let us see how we might properly formulate the minimal curve problem. Let us
assume that the minimal curve is given as the graph of a smooth function y = u(x). Then,
according to (A.27), the length of the curve is given by the standard arc length integral
    J[u] = \int_a^b \sqrt{1 + (u')^2}\, dx,        (20.3)

where we abbreviate u' = du/dx. The function is required to satisfy the boundary conditions

    u(a) = \alpha,        u(b) = \beta,        (20.4)
in order that its graph pass through the two prescribed points (20.1). The minimal curve
problem requires us to find the function y = u(x) that minimizes the arc length functional
(20.3) among all reasonable functions satisfying the prescribed boundary conditions. The
student should pause to reflect on whether it is mathematically obvious that the affine

(We assume that a \ne b, i.e., the points a, b do not lie on a common vertical line.)

function (20.2) is the one that minimizes the arc length integral (20.3) subject to the given
boundary conditions. One of the motivating tasks of the calculus of variations, then, is to
rigorously prove that our childhood intuition is indeed correct.
Indeed, the word reasonable is important. For the arc length functional to be
defined, the function u(x) should be at least piecewise C1 , i.e., continuous with a piecewise
continuous derivative. If we allow discontinuous functions, then the straight line (20.2)
does not, in most cases, give the minimizer; see Exercise . Moreover, continuous functions
which are not piecewise C1 may not have a well-defined length. The more seriously one
thinks about these issues, the less evident the solution becomes. But, rest assured that the
obvious solution (20.2) does indeed turn out to be the true minimizer. However, a fully
rigorous mathematical proof of this fact requires a proper development of the calculus of
variations machinery.
A closely related problem arises in optics. The general principle, first formulated by
the seventeenth century French mathematician Pierre de Fermat, is that when a light ray
moves through an optical medium, e.g., a vacuum, it travels along a path that will minimize
the travel time. As always, Nature seeks the most economical solution! Let c(x, y) denote
the speed of light at each point in the medium . Speed is equal to the time derivative of
distance travelled, namely, the arc length (20.3) of the curve y = u(x) traced by the light
ray. Thus,
    c(x, u(x)) = \frac{ds}{dt} = \sqrt{1 + u'(x)^2}\; \frac{dx}{dt}.

Integrating from start to finish, we conclude that the total travel time of the light ray is
equal to
    T[u] = \int_0^T dt = \int_a^b \frac{dt}{dx}\, dx = \int_a^b \frac{\sqrt{1 + u'(x)^2}}{c(x, u(x))}\, dx.        (20.5)

Fermat's Principle states that, to get from one point to another, the light ray follows the curve y = u(x) that minimizes this functional. If the medium is homogeneous, then c(x, y) \equiv c is constant, and T[u] equals a multiple of the arc length functional, whose minimizers are the obvious straight lines. In an inhomogeneous medium, the path taken by the light ray is no longer evident, and we are in need of a systematic method for solving the minimization problem. All of the known laws of optics and lens design, governing focusing, refraction, etc., follow as consequences of the minimization principle, [optics].
Another problem of a similar ilk is to construct the geodesics on a curved surface,
meaning the curves of minimal length. In other words, given two points a, b on a surface
S R 3 , we seek the curve C S that joins them and has the minimal possible length.
For example, if S is a circular cylinder, then the geodesic curves turn out to be straight
lines parallel to the center line, circles orthogonal to the center line, and spiral helices;
see Figure geocyl for an illustration. Similarly, the geodesics on a sphere are arcs of
great circles; these include the circumpolar paths followed by airplanes around the globe.
However, both of these claims are in need of rigorous justification.

For simplicity, we only consider the two-dimensional case here.

In order to mathematically formulate the geodesic problem, we suppose, for simplicity,


that our surface S R 3 is realized as the graph of a function z = F (x, y). We seek the
geodesic curve C S that joins the given points
a = (a, , F (a, )),

and

b = (b, , F (b, )),

on the surface

S.

Let us assume that C can be parametrized by the x coordinate, in the form


y = u(x),

z = F (x, u(x)).

In particular, this requires a 6= b. The length of the curve is given by the usual arc length
integral (B.17), and so we must minimize the functional
    J[u] = \int_a^b \sqrt{ 1 + \Bigl( \frac{dy}{dx} \Bigr)^2 + \Bigl( \frac{dz}{dx} \Bigr)^2 }\; dx
         = \int_a^b \sqrt{ 1 + \Bigl( \frac{du}{dx} \Bigr)^2 + \Bigl( \frac{\partial F}{\partial x}(x, u(x)) + \frac{\partial F}{\partial y}(x, u(x))\, \frac{du}{dx} \Bigr)^2 }\; dx,

subject to the boundary conditions

    u(a) = \alpha,        u(b) = \beta.

For example, the geodesics on the paraboloid


z=

1
2

x2 +

1
2

y2

(20.6)

can be found by minimizing the functional


Z bp
J[ u ] =
1 + (u0 )2 + (x + u u0 )2 dx

(20.7)

subject to prescribed boundary conditions.


Minimal Surfaces

The minimal surface problem is a natural generalization of the minimal curve problem.
In its simplest manifestation, we are given a simple closed curve C \subset R^3. The problem is to find the surface S of least total area among all those whose boundary \partial S = C coincides with the given curve. Therefore, we seek to minimize the surface area integral

    area S = \iint_S dS

over all possible surfaces S \subset R^3 with the prescribed boundary curve \partial S = C. Such an area-minimizing surface is known as a minimal surface for short.

Cylinders are not graphs, but can be placed within this framework by passing to cylindrical
coordinates. Similarly, spherical surfaces are best treated in spherical coordinates.

Physically, if we take a wire in the shape of the curve C and dip it into soapy water,
then the surface tension forces in the resulting soap film will force it to minimize surface
area, and hence be a minimal surface . For example, if the curve is a closed plane curve,
e.g., a circle, then the minimal surface will just be the planar region enclosed by the curve.
But, if the curve C twists into the third dimension, then the shape of the minimizer is
by no means evident. Soap films and bubbles have been the source of much fascination,
physical, aesthetical and mathematical, over the centuries. The least area problem is also known as Plateau's Problem, after the nineteenth century French physicist Joseph
Plateau, who conducted systematic experiments. A satisfactory solution to the simplest
version of the minimal surface problem was only achieved in the mid twentieth century,
[91, 93]. Problems arising in engineering design, architecture, and biology, such as foams,
membranes and drug delivery methods, make this problem of continued contemporary
importance and an active area of research.
Let us mathematically formulate the search for a minimal surface as a problem in the
calculus of variations. For simplicity, we shall assume that the bounding curve C projects
down to a simple closed curve \Gamma = \partial\Omega that bounds an open domain \Omega \subset R^2 in the (x, y) plane, as in Figure minsurf . The space curve C \subset R^3 is then given by z = g(x, y) for (x, y) \in \partial\Omega. For reasonable curves C, we expect that the minimal surface S will be described as the graph of a function z = u(x, y) parametrized by (x, y) \in \Omega. The surface area of such a graph is given by the double integral

    J[u] = \iint_\Omega \sqrt{ 1 + \Bigl( \frac{\partial u}{\partial x} \Bigr)^2 + \Bigl( \frac{\partial u}{\partial y} \Bigr)^2 }\; dx\, dy;        (20.8)

see (B.39). To find the minimal surface, then, we seek the function z = u(x, y) that minimizes the surface area integral (20.8) when subject to the Dirichlet boundary conditions

    u(x, y) = g(x, y)        for        (x, y) \in \partial\Omega        (20.9)

that prescribe the boundary curve C. As we shall see, the solutions to this minimization
problem satisfy a certain nonlinear second order partial differential equation, given in
(20.49) below.
A simple version of the minimal surface problem, that still contains many interesting
features, is to find minimal surfaces of revolution. Recall that a surface of revolution is
obtained by revolving a plane curve about an axis, which, for definiteness, we take to be the x axis. Thus, given two points a = (a, \alpha), b = (b, \beta) \in R^2, our goal is to find the curve y = u(x) joining them such that the surface of revolution obtained by revolving the curve around the x-axis has the least surface area. According to Exercise , the area of such a surface of revolution is given by

    J[u] = \int_a^b 2\pi\, | u |\, \sqrt{1 + (u')^2}\; dx.        (20.10)

More correctly, the soap film will realize a local but not necessarily global minimum for the
surface area functional.

We seek a minimizer of this integral among all functions u(x) that satisfy the boundary conditions

    u(a) = \alpha,        u(b) = \beta.

The minimal surface of revolution can be physically realized by stretching a soap film between two wire circles, of radius \alpha and \beta, placed a distance b - a apart. Symmetry considerations will require the minimizing surface to be rotationally symmetric. Interestingly, the revolutionary surface area functional (20.10) is exactly the same as the optical functional (20.5) when the light speed at a point is inversely proportional to its distance from the horizontal axis, namely c(x, y) = 1/(2\pi\, | y |).

20.2. The Simplest Variational Problem.

Even the preceding, rather limited collection of examples of variational problems


should already convince the reader of the practical utility of the calculus of variations.
Let us now discuss the most basic analytical techniques for solving such minimization
problems. We will exclusively deal with the classical approach, leaving more modern direct methods the function space equivalent of the gradient descent method to a more
indepth treatment of the subject, rfcvar.
Let us concentrate on the simplest class of variational problems, in which the unknown is a continuously differentiable scalar function, and the functional to be minimized depends upon at most its first derivative. The basic minimization problem, then, is to determine the function y = u(x) \in C^1[a, b] that minimizes the objective functional

    J[u] = \int_a^b L(x, u, u')\, dx        (20.11)

subject to certain prescribed boundary conditions. The integrand L(x, u, p) is known as the Lagrangian for the variational problem, in honor of Joseph-Louis Lagrange, who was one of the founders of the subject. We usually assume that L(x, u, p) is a reasonably smooth function of all three of its (scalar) arguments x, u and p, which represents the derivative u'. For example, the arc length functional (20.3) has Lagrangian function L(x, u, p) = \sqrt{1 + p^2}, whereas in the surface of revolution problem (20.10), we have L(x, u, p) = 2\pi\, | u |\, \sqrt{1 + p^2}.
(In the latter case, the points where u = 0 are slightly problematic, since L is not continuously differentiable there.)
In order to uniquely specify a minimizing function, we must impose suitable boundary
conditions. All of the usual suspects Dirichlet (fixed), Neumann (free), as well as mixed
and periodic boundary conditions that arose in Chapter 10 are also of interest here. In
the interests of brevity, we shall concentrate on the Dirichlet boundary conditions
    u(a) = \alpha,        u(b) = \beta,        (20.12)

as these are the most common in physical problems, although some of the exercises will
investigate other types.
The First Variation and the EulerLagrange Equation
According to Section 18.3, the (local) minimizers of a (sufficiently nice) function defined on a finite-dimensional vector space are initially characterized as critical points,
where the gradient of the function vanishes. An analogous construction applies in the
infinite-dimensional context treated by the calculus of variations. Every minimizing function of a suffciently nice functional J[ u ] is a critical function, meaning a function where
the functional gradient J[ u ] = 0 vanishes. Indeed, the justification of this result that
was outlined in Section 18.3 continues to apply here; see, in particular, the proof of Theorem 18.42. Of course, not every critical point turns out to be a minimum. In nondegenerate
situations, the classification of critical points into local minima, maxima, or saddle points,
relies on the second derivative test. The functional version of the second derivative test
the second variation is the topic of Section 20.3.
Thus, our first order of business is to learn how to compute the gradient of a functional defined on an infinite-dimensional function space. Adapting the general Definition 18.38 of the gradient of a function defined on an inner product space, the gradient \nabla J[u] of the functional (20.11) should be defined by the same basic formula

    \langle \nabla J[u]\, ;\, v \rangle = \frac{d}{dt}\, J[u + t v] \Big|_{t=0}.        (20.13)

Here v(x) is a function, the direction in which the derivative is computed. Classically, v is known as a variation in the function u, sometimes written v = \delta u, whence the term calculus of variations. The gradient operator on functionals is often referred to as the variational derivative. The inner product used in (20.13) is taken (again for simplicity) to be the standard L^2 inner product

    \langle f\, ;\, g \rangle = \int_a^b f(x)\, g(x)\, dx        (20.14)

on function space.
Now, starting with (20.11), we have

    J[u + t v] = \int_a^b L(x, u + t v, u' + t v')\, dx.        (20.15)

We need to compute the derivative of the integral with respect to t. Assuming smoothness of the integrand allows us to bring the derivative inside the integral and so, by the chain rule,

    \frac{d}{dt}\, J[u + t v] = \frac{d}{dt} \int_a^b L(x, u + t v, u' + t v')\, dx
                              = \int_a^b \Bigl[ v\, \frac{\partial L}{\partial u}(x, u + t v, u' + t v') + v'\, \frac{\partial L}{\partial p}(x, u + t v, u' + t v') \Bigr]\, dx.

Therefore, setting t = 0 to evaluate (20.13), we find

    \langle \nabla J[u]\, ;\, v \rangle = \int_a^b \Bigl[ v\, \frac{\partial L}{\partial u}(x, u, u') + v'\, \frac{\partial L}{\partial p}(x, u, u') \Bigr]\, dx.        (20.16)

The resulting integral is often referred to as the first variation of the functional J[u].
The right hand side of (20.16) needs to be written as an inner product,

    \langle \nabla J[u]\, ;\, v \rangle = \int_a^b \nabla J[u]\, v\, dx = \int_a^b h\, v\, dx,

between some function h(x) = \nabla J[u] and the variation v. The first term has this form, but the derivative v' appearing in the second term is problematic. However, as the reader of Chapter 10 already knows, the secret behind removing derivatives in an integral formula is integration by parts. If we set

    r(x) \equiv \frac{\partial L}{\partial p}(x, u(x), u'(x)),

we can re-express the offending term as

    \int_a^b r(x)\, v'(x)\, dx = \bigl[ r(b)\, v(b) - r(a)\, v(a) \bigr] - \int_a^b r'(x)\, v(x)\, dx,        (20.17)

where, again by the chain rule,

    r'(x) = \frac{d}{dx}\, \frac{\partial L}{\partial p}(x, u, u') = \frac{\partial^2 L}{\partial x\, \partial p}(x, u, u') + u'\, \frac{\partial^2 L}{\partial u\, \partial p}(x, u, u') + u''\, \frac{\partial^2 L}{\partial p^2}(x, u, u').        (20.18)
So far we have not imposed any conditions on our variation v(x). We are comparing the values of J[u] only among the functions that satisfy the prescribed boundary conditions, namely

    u(a) = \alpha,        u(b) = \beta.

Therefore, we must make sure that the varied function \hat{u}(x) = u(x) + t v(x) remains within this space of functions, and so it must satisfy the same boundary conditions \hat{u}(a) = \alpha, \hat{u}(b) = \beta. But u(x) already satisfies the boundary conditions, and so the variation v(x) must satisfy the corresponding homogeneous boundary conditions

    v(a) = 0,        v(b) = 0.        (20.19)

As a result, both boundary terms in our integration by parts formula (20.17) vanish, and we can write (20.16) as

    \langle \nabla J[u]\, ;\, v \rangle = \int_a^b \nabla J[u]\, v\, dx = \int_a^b v \Bigl[ \frac{\partial L}{\partial u}(x, u, u') - \frac{d}{dx}\, \frac{\partial L}{\partial p}(x, u, u') \Bigr]\, dx.

We conclude that

    \nabla J[u] = \frac{\partial L}{\partial u}(x, u, u') - \frac{d}{dx}\, \frac{\partial L}{\partial p}(x, u, u').        (20.20)

This is our explicit formula for the functional gradient or variational derivative of the functional (20.11) with Lagrangian L(x, u, p). Note that the gradient \nabla J[u] of a functional is a function.
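Formula (20.20) can be checked numerically by comparing the inner product (20.13) against a finite difference of J. The following Python sketch does this for the arc length Lagrangian; the grid, trial function, variation, and tolerances are our own arbitrary choices.

    import numpy as np

    x  = np.linspace(0.0, 1.0, 20001)
    dx = x[1] - x[0]
    trap = lambda y: np.sum(0.5 * (y[1:] + y[:-1])) * dx   # trapezoid quadrature

    u, du, d2u = np.sin(x), np.cos(x), -np.sin(x)          # trial function u(x) = sin x
    v, dv      = x * (1.0 - x), 1.0 - 2.0 * x              # variation with v(0) = v(1) = 0

    J = lambda dw: trap(np.sqrt(1.0 + dw**2))              # J[w] = int sqrt(1 + (w')^2) dx

    grad = -d2u / (1.0 + du**2)**1.5                       # (20.20) for L = sqrt(1 + p^2)
    lhs  = trap(grad * v)                                  # < grad J[u] ; v >

    eps = 1e-6                                             # centered difference in t
    rhs = (J(du + eps * dv) - J(du - eps * dv)) / (2.0 * eps)

    print(lhs, rhs)                                        # the two values should agree closely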
The critical functions u(x), which include all local minimizers, are, by definition, those where the functional gradient vanishes: \nabla J[u] = 0. Thus, u(x) must satisfy

    \frac{\partial L}{\partial u}(x, u, u') - \frac{d}{dx}\, \frac{\partial L}{\partial p}(x, u, u') = 0.        (20.21)

In view of (20.18), we see that (20.21) is, in fact, a second order ordinary differential equation,

    E(x, u, u', u'') = \frac{\partial L}{\partial u}(x, u, u') - \frac{\partial^2 L}{\partial x\, \partial p}(x, u, u') - u'\, \frac{\partial^2 L}{\partial u\, \partial p}(x, u, u') - u''\, \frac{\partial^2 L}{\partial p^2}(x, u, u') = 0,

known as the Euler-Lagrange equation associated with the variational problem (20.11).
Any solution to the Euler-Lagrange equation that is subject to the assumed boundary conditions forms a critical point for the functional, and hence is a potential candidate for the desired minimizing function. And, in many cases, the Euler-Lagrange equation suffices to characterize the desired minimizer without further ado.

Theorem 20.1. Suppose the Lagrangian function is at least twice continuously differentiable: L(x, u, p) \in C^2. Then any C^2 minimizer u(x) to the corresponding functional J[u] = \int_a^b L(x, u, u')\, dx must satisfy the associated Euler-Lagrange equation (20.21).

Let us now investigate what the Euler-Lagrange equation tells us about the examples of variational problems presented at the beginning of this section. One word of warning: there do exist seemingly reasonable functionals whose minimizers are not, in fact, C^2, and hence do not solve the Euler-Lagrange equation; see [BallMizel] for examples. Fortunately, in the problems we usually consider, such pathologies do not appear.
Curves of Shortest Length

Consider the problem of finding the curve of shortest length connecting two points a = (a, \alpha), b = (b, \beta) \in R^2 in the plane. As we saw in Section 20.1, this requires minimizing the arc length integral

    J[u] = \int_a^b \sqrt{1 + (u')^2}\, dx        with Lagrangian        L(x, u, p) = \sqrt{1 + p^2}.

Since

    \frac{\partial L}{\partial u} = 0,        \frac{\partial L}{\partial p} = \frac{p}{\sqrt{1 + p^2}},

the Euler-Lagrange equation (20.21) in this case takes the form

    0 = - \frac{d}{dx}\, \frac{u'}{\sqrt{1 + (u')^2}} = - \frac{u''}{\bigl( 1 + (u')^2 \bigr)^{3/2}}.

Since the denominator does not vanish, the Euler-Lagrange equation reduces to the simplest second order ordinary differential equation

    u'' = 0.        (20.22)

All solutions to the Euler-Lagrange equation are affine functions, u = c x + d, whose graphs are straight lines. Since our solution must also satisfy the boundary conditions \alpha = u(a), \beta = u(b), the only critical function, and hence the sole candidate to be a minimizer, is the unique straight line

    y = \frac{\beta - \alpha}{b - a}\, (x - a) + \alpha        (20.23)

passing through the two points. Thus, the Euler-Lagrange equation helps to reconfirm our intuition that straight lines minimize distance.
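For readers who like to check such computations by machine, sympy provides a symbolic Euler-Lagrange helper; the following short sketch (our own choice of tool, not part of the text) derives the equation for the arc length Lagrangian and should recover u'' = 0 up to the nonvanishing denominator.

    from sympy import Function, Symbol, sqrt, simplify
    from sympy.calculus.euler import euler_equations

    x = Symbol('x')
    u = Function('u')

    # arc length Lagrangian  L(x, u, p) = sqrt(1 + p^2)  with  p = u'(x)
    L = sqrt(1 + u(x).diff(x)**2)

    eq, = euler_equations(L, u(x), x)
    print(simplify(eq))   # equivalent to  u''(x) / (1 + u'(x)**2)**(3/2) = 0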
Be that as it may, the fact that a function satisfies the EulerLagrange equation
and the boundary conditions merely gives it the status of a candidate for minimizing
the variational problem. By the same token, a critical function is also a candidate for
maximizing the variational problem, too. The nature of the critical functions can only be
distinguished by a second derivative test, which requires further work. Of course, for the
present problem, we know that a straight line cannot maximize distance, and must be
the minimizer. Nevertheless, the reader should have a little nagging doubt that we have
completely solved the minimum distance problem . . .
Minimal Surface of Revolution
Consider next the problem of finding the curve connecting two points having a surface of revolution of minimal surface area. For simplicity, we assume that the curve is given by the graph of a non-negative function y = u(x) \ge 0. According to (20.10), the required curve will minimize the functional

    J[u] = \int_a^b u\, \sqrt{1 + (u')^2}\, dx,        with Lagrangian        L(x, u, p) = u\, \sqrt{1 + p^2},        (20.24)

where we have dropped an irrelevant factor of 2\pi and used our positivity assumption to omit the absolute value on u in the integrand. Since

    \frac{\partial L}{\partial u} = \sqrt{1 + p^2},        \frac{\partial L}{\partial p} = \frac{u\, p}{\sqrt{1 + p^2}},

the Euler-Lagrange equation (20.21) is

    \sqrt{1 + (u')^2} - \frac{d}{dx}\, \frac{u\, u'}{\sqrt{1 + (u')^2}} = \frac{1 + (u')^2 - u\, u''}{\bigl( 1 + (u')^2 \bigr)^{3/2}} = 0.        (20.25)

Therefore, to find the critical functions, we need to solve a nonlinear second order ordinary differential equation, and not one in a familiar form. Fortunately, there is a little trick we can use to find the solution. If we multiply by u', then we can rewrite the result as an exact derivative:

    u'\, \frac{1 + (u')^2 - u\, u''}{\bigl( 1 + (u')^2 \bigr)^{3/2}} = \frac{d}{dx}\, \frac{u}{\sqrt{1 + (u')^2}} = 0.

(Actually, as with many tricks, this is really an indication that something profound is going on. Noether's Theorem, a result of fundamental importance in modern physics that relates symmetries and conservation laws, [54, 97], underlies the integration method. See also Exercise .)

We conclude that

    \frac{u}{\sqrt{1 + (u')^2}} = c,        (20.26)

where c is a constant of integration. The left hand side of (20.26), being constant on the entire solution, is a first integral for the differential equation, cf. Definition 19.25. The resulting equation is an implicit form of an autonomous first order differential equation. Solving for the derivative,

    \frac{du}{dx} = u' = \frac{\sqrt{u^2 - c^2}}{c},

leads to an autonomous first order ordinary differential equation, which we can immediately solve:

    \int \frac{c\, du}{\sqrt{u^2 - c^2}} = x + \delta,

where \delta is a constant of integration. According to Exercise , the most useful form of the integral is in terms of the inverse to the hyperbolic function \cosh z = \tfrac{1}{2}(e^z + e^{-z}), whereby

    \cosh^{-1} \frac{u}{c} = \frac{x + \delta}{c},        and hence        u = c\, \cosh\Bigl( \frac{x + \delta}{c} \Bigr).        (20.27)

In this manner, we have produced the general solution to the Euler-Lagrange equation
(20.25). Any solution that also satisfies the boundary conditions provides a critical function
for the surface area functional (20.24), and hence is a candidate for the minimizer.
The curve prescribed by the graph of a hyperbolic cosine function (20.27) is known
as a catenary. It is not a parabola, even though to the untrained eye it looks similar.
Interestingly, the catenary is the same profile as a hanging chain. Owing to their minimizing
properties, catenaries are quite common in engineering design; for instance, the cables
in a suspension bridge such as the Golden Gate Bridge are catenaries, as is the arch in
St. Louis.
So far, we have not taken into account the boundary conditions u(a) = and u(b) = .
It turns out that there are three distinct possibilities, depending upon the configuration
of the boundary points:
(a) There is precisely one value of the two integration constants c, \delta that satisfies the
two boundary conditions. In this case, it can be proved that this catenary is the
unique curve that minimizes the area of its associated surface of revolution.
(b) There are two different possible values of c, \delta that satisfy the boundary conditions.
In this case, one of these is the minimizer, and the other is a spurious solution,
one that corresponds to a saddle point for the functional.
(c) There are no values of c, \delta that allow (20.27) to satisfy the two boundary conditions.
This occurs when the two boundary points a, b are relatively far apart. In this
configuration, the physical soap film spanning the two circular wires breaks apart
into two circular disks, and this defines the minimizer for the problem, i.e., there is
no surface of revolution that has a smaller surface area than the two disks. (In the
first two cases, this is not valid; the minimizing catenary has a smaller surface area
than the two disks.) However, the function that minimizes this configuration
consists of two vertical lines from a and b to the x axis along with the portion of
the axis lying between them. We can approximate this function by a sequence of
genuine functions that give progressively smaller and smaller values to the surface
area functional (20.10), but the actual minimum is not attained among the class
of (smooth) functions.
Thus, even in such a reasonably simple example, a number of the subtle complications arising in the calculus of variations can already be seen. Lack of space precludes a
more detailed development of these ideas here, and we refer the interested reader to more
specialized books devoted to the calculus of variations, including [31, 54].
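Which of the three cases occurs for given endpoints can be explored numerically by searching for constants c, \delta in (20.27) that match the boundary values. The following Python sketch uses a standard root finder; the endpoint data and starting guess are our own choices, and different guesses may pick out different (or no) catenaries.

    import numpy as np
    from scipy.optimize import fsolve

    a, alpha = 0.0, 1.0
    b, beta  = 1.0, 1.0

    def equations(vars):
        # residuals of the boundary conditions for  u = c cosh((x + delta)/c)
        c, delta = vars
        return [c * np.cosh((a + delta) / c) - alpha,
                c * np.cosh((b + delta) / c) - beta]

    sol, info, ier, msg = fsolve(equations, x0=[0.5, -0.5], full_output=True)
    if ier == 1:
        c, delta = sol
        print(f"catenary found: c = {c:.6f}, delta = {delta:.6f}")
    else:
        print("no catenary found from this starting guess:", msg)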
The Brachistochrone Problem
The most famous classical variational problem is the so-called brachistochrone problem. The word brachistochrone means "minimal time" in Greek. An experimenter lets a bead slide down a wire that connects two prescribed points. The goal is to shape the wire in such a way that, starting from rest, the bead slides from one end to the other in minimal time. Naive guesses for the wire's shape, including a straight line, a parabola, and a circular arc, are wrong. One can do better through a careful analysis of the associated variational problem. The brachistochrone problem was originally posed by Johann Bernoulli in 1696, and served as an inspiration for much of the subsequent development of the subject.
We take the starting point of the bead at the origin: a = (0, 0). The wire will bend downwards, and to avoid annoying minus signs in the subsequent formulae, we take our vertical y axis to point downwards, so the wire has the shape given by the graph of y = u(x) > 0. The end point b = (b, \beta) is assumed to lie below and to the right, and so b > 0 and \beta > 0; the set-up is sketched in Figure brach . The first step is to find the formula for the transit time of the bead sliding along the wire. Arguing as in our derivation of the optics functional (20.5), if v denotes the speed of descent of the bead, then the total travel time is

    T[u] = \int_0^b \frac{\sqrt{1 + (u')^2}}{v}\, dx.        (20.28)
We shall use conservation of energy to determine a formula for the speed v as a function
of the position along the wire.
The kinetic energy of the bead is \tfrac{1}{2} m v^2, where m is its mass and v \ge 0 its speed of descent. On the other hand, due to our sign convention, the potential energy of the bead when it is at height y is - m g y, where m is its mass and g the gravitational force, and we take the initial height y = 0 as the zero potential energy level. The bead is initially at rest, with 0 kinetic energy and 0 potential energy. Assuming that frictional forces are negligible, conservation of energy implies that

    0 = \tfrac{1}{2}\, m v^2 - m g y.

(Here "function" must be taken in a very broad sense, as this situation does not even correspond to a generalized function!)

We can solve this equation to determine the bead's speed as a function of its height:

    v = \sqrt{2 g y}.        (20.29)

Substituting this expression into (20.28), we conclude that the shape y = u(x) of the wire is obtained by minimizing the functional

    T[u] = \int_0^b \sqrt{\frac{1 + (u')^2}{2 g\, u}}\; dx.        (20.30)
The associated Lagrangian is

    L(x, u, p) = \sqrt{\frac{1 + p^2}{u}},

where we omit an irrelevant factor of \sqrt{2 g} (or adopt physical units in which g = \tfrac{1}{2}). We compute

    \frac{\partial L}{\partial u} = - \frac{\sqrt{1 + p^2}}{2\, u^{3/2}},        \frac{\partial L}{\partial p} = \frac{p}{\sqrt{u\, (1 + p^2)}}.
Therefore, the Euler-Lagrange equation for the brachistochrone functional (20.30) is

    - \frac{\sqrt{1 + (u')^2}}{2\, u^{3/2}} - \frac{d}{dx}\, \frac{u'}{\sqrt{u\, (1 + (u')^2)}} = - \frac{2\, u\, u'' + (u')^2 + 1}{2\, u^{3/2} \bigl( 1 + (u')^2 \bigr)^{3/2}} = 0,

and is equivalent to the nonlinear second order ordinary differential equation

    2\, u\, u'' + (u')^2 + 1 = 0.

Rather than try to solve this differential equation directly, we note that the Lagrangian does not depend upon x, and therefore we can use the result of Exercise that states that the Hamiltonian

    H(x, u, p) = L - p\, \frac{\partial L}{\partial p} = \frac{1}{\sqrt{u\, (1 + p^2)}}

is a first integral, and hence

    \frac{1}{\sqrt{u\, (1 + (u')^2)}} = k,        which we rewrite as        u\, \bigl( 1 + (u')^2 \bigr) = c,

where c = 1/k^2 is a constant. Solving for the derivative u' results in the first order autonomous ordinary differential equation

    \frac{du}{dx} = \sqrt{\frac{c - u}{u}}.

This equation can be explicitly solved by separation of variables, and so, integrating from the initial point x = u = 0,

    \int_0^u \sqrt{\frac{u}{c - u}}\; du = x.
The integration can be done by use of a trigonometric substitution, namely

    u = \tfrac{1}{2}\, c\, (1 - \cos r),        whereby        x = \int \sqrt{\frac{u}{c - u}}\; du = \tfrac{1}{2}\, c\, (r - \sin r).        (20.31)

The resulting pair of equations (20.31) serve to parametrize a curve (x(r), u(r)) known as a cycloid. According to Exercise , a cycloid can be visualized as the curve that is traced by a point sitting on the edge of a rolling wheel; see Exercise for details. Thus, all solutions to the Euler-Lagrange equation are the cycloids, described in parametric form by (20.31). Any cycloid which satisfies the boundary conditions supplies us with a critical function, and hence a candidate for the solution to the brachistochrone minimization problem.
With a little more work, it can be proved that there is precisely one value of the integration constant c that satisfies the two boundary conditions, and, moreover, that this particular cycloid minimizes the brachistochrone functional. Examples are plotted in Figure cyc . Interestingly, in certain configurations, namely if \beta < 2 b/\pi, the curve dips below the lower endpoint b.
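A simple numerical experiment makes the superiority of the cycloid tangible: solve for the cycloid (20.31) through a chosen endpoint and compare its transit time with that of the straight line, each computed by elementary quadrature of the travel time integral. The following Python sketch does this; the endpoint, the value of g, and the discretization are our own illustrative choices.

    import numpy as np
    from scipy.optimize import brentq

    g = 9.81
    b, beta = 1.0, 0.5            # endpoint (b, beta), with y measured downwards

    # find the cycloid through (b, beta): solve for the endpoint parameter r_b
    f = lambda r: (1.0 - np.cos(r)) / (r - np.sin(r)) - beta / b
    r_b = brentq(f, 1e-6, 2.0 * np.pi - 1e-6)
    c = 2.0 * b / (r_b - np.sin(r_b))

    def travel_time(x, y):
        # approximate T = int ds / sqrt(2 g y) along a sampled curve
        ds = np.hypot(np.diff(x), np.diff(y))
        ymid = 0.5 * (y[1:] + y[:-1])
        return np.sum(ds / np.sqrt(2.0 * g * ymid))

    r = np.linspace(1e-4, r_b, 100001)
    T_cycloid = travel_time(0.5 * c * (r - np.sin(r)), 0.5 * c * (1.0 - np.cos(r)))
    s = np.linspace(1e-6, 1.0, 100001)
    T_line = travel_time(b * s, beta * s)
    print(f"cycloid: {T_cycloid:.4f} s   straight line: {T_line:.4f} s")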

20.3. The Second Variation.


The solutions to the Euler-Lagrange boundary value problem are the critical functions for the variational principle, meaning that they cause the functional gradient to vanish. In the finite-dimensional theory, being a critical point is only a necessary condition for minimality. One must impose additional conditions, based on the second derivative of the objective function at the critical point, in order to guarantee that it is a minimum and not a maximum or saddle point. Similarly, in the calculus of variations, the solutions to the Euler-Lagrange equation may also include (local) maxima, as well as other non-extremal critical functions. To distinguish between the different possible solutions, we need to formulate a second derivative test for the objective functional on an infinite-dimensional function space. In the calculus of variations, the second derivative of a functional is known as its second variation, the Euler-Lagrange expression being also known as the first variation.
In the finite-dimensional version, the second derivative test was based on the positive definiteness of the Hessian matrix. The justification relied on a second order Taylor
expansion of the objective function at the critical point. Thus, in an analogous fashion,
we expand the objective functional J[ u ] near the critical function. Consider the scalar
function g(t) = J[ u + t v ], where the function v(x) represents a variation. The second
order Taylor expansion of g(t) takes the form
    g(t) = J[u + t v] = J[u] + t\, K[u; v] + \tfrac{1}{2}\, t^2\, Q[u; v] + \cdots .

For small t, the higher order terms play a negligible role, and can be ignored. The first order terms are linear in the variation v,
and given by an inner product
    g'(0) = K[u; v] = \langle \nabla J[u]\, ;\, v \rangle

between the variation and the functional gradient. In particular, if u = u^\star is a critical function, then the first order terms vanish,

    K[u^\star; v] = \langle \nabla J[u^\star]\, ;\, v \rangle = 0
for all allowable variations v, meaning those that satisfy the homogeneous boundary conditions. Therefore, the nature of the critical function u^\star, minimum, maximum, or neither, is, in most cases, determined by the second derivative terms

    g''(0) = Q[u^\star; v].

As in the finite-dimensional Theorem 18.45, if u is a minimizer, then Q[u; v] \ge 0. Conversely, if Q[u; v] > 0 for v \not\equiv 0, i.e., the second derivative terms satisfy a condition of positive definiteness, then u will be a strict local minimizer. This forms the crux of the second derivative test.
Let us explicitly evaluate the second derivative terms for the simplest variational
problem (20.11). We need to expand the scalar function
g(t) = J[ u + t v ] =

L(x, u + t v, u0 + t v 0 ) dx
a

in a Taylor series around t = 0. The linear terms in t were already found in (20.16), and
so we need to compute the quadratic terms:
00

Q[ u; v ] = g (0) =

b
a

where the coefficient functions


A(x) =

2L
(x, u, u0 ) ,
u2

B(x) =

A v 2 + 2 B v v 0 + C (v 0 )2 dx,

2L
(x, u, u0 ) ,
u p

C(x) =

(20.32)

2L
(x, u, u0 ) ,
p2

(20.33)

are found by evaluating certain second order derivatives of the Lagrangian at the critical
function u(x). The quadratic functional (20.32) is known as the second variation of the
original functional J[ u ], and plays the role of the Hessian matrix for functionals. In
contrast to the first variation, it is not possible to eliminate all of the derivatives on v
in the quadratic functional (20.32) through integration by parts. This causes significant
complications for the analysis.
To formulate conditions that the critical function be a minimizer for the functional,
we need to determine when such a quadratic functional is positive definite, meaning that
Q[ u; v ] > 0 for all non-zero allowable variations v(x) 6 0. Clearly, if the integrand is
positive definite at each point, so
A(x) v 2 + 2 B(x) v v 0 + C(x) (v 0 )2 > 0

for all

a x b,

(v, v 0 ) 6= 0,

(20.34)

then the second variation Q[ u; v ] is also positive definite.


Examplep
20.2. For the arc length minimization functional (20.3), the Lagrangian
is L(x, u, p) = 1 + p2 . To analyze the second variation, we first compute
2L
= 0,
u2

3/7/03

2L
= 0,
u p
895

2L
1
.
=
2
p
(1 + p2 )3/2

c 2003

Peter J. Olver

For the critical straight line function u = u? given in (20.23), we evaluate at p = u0 =


( )/(b a), and so
A(x) =

2L
= 0,
u2

B(x) =

2L
= 0,
u p

C(x) =

2L
(b a)3
=
k

3/2 .

p2
(b a)2 + ( )2

Therefore, the second variation functional (20.32) is


Z b
?
Q[ u ; v ] =
k (v 0 )2 dx,
a

where k > 0 is a positive constant. Thus, Q[ u? ; v ] = 0 vanishes if and only if v is a


constant function. But the variation v is required to satisfy the homogeneous boundary
conditions v(a) = v(b) = 0, and hence the functional is positive definite for all allowable
nonzero variations. Therefore, we can finally conclude that the straight line is, indeed, a
(local) minimizer for the arc length functional. We have at last justified our intuition that
the shortest distance between two points is a straight line!
In general, as the following example points out, the pointwise positivity condition
(20.34) is overly restrictive.
Example 20.3. Consider the quadratic functional
Z 1
0 2

(v ) v 2 dx.
Q[ v ] =

(20.35)

The claim is that Q[ v ] > 0 is positive definite for all nonzero v 6 0 subject to homogeneous
Dirichlet boundary conditions v(0) = 0 = v(1). This result is not trivial! Indeed, the
boundary conditions play an essential role, since choosing v(x) c to be any constant
function will produce a negative value for the functional: Q[ v ] = c 2 .
To prove the claim, consider the quadratic functional
Z 1
e
Q[ v ] =
(v 0 + v tan x)2 dx 0,
0

which is clearly positive semi-definite since the integrand is everywhere 0; moreover, the
integral vanishes if and only if v satisfies the first order linear ordinary differential equation
v 0 + v tan x = 0,

for all

0 x 1.

The only solution that also satisfies boundary condition v(0) = 0 is the trivial one v 0.
e v ] = 0 if and only if v 0, and hence Q
e > 0 is a positive definite
We conclude that Q[
quadratic functional.
Leet us expand the latter functional,
Z 1
0 2

e
(v ) + 2 v v 0 tan x + v 2 tan2 x dx
Q[ v ] =
=

3/7/03

0 2

(v ) v (tan x) + v tan x dx =
896

(v 0 )2 v 2 dx = Q[ v ].
c 2003

Peter J. Olver

In the second equality, we integrated the middle term by parts, using (v 2 )0 = 2 v v 0 , and
e v ] is positive definite, so is Q[ v ], justifying
noting that the boundary terms vanish. Since Q[
the previous claim.
To see how subtle this result is, consider the almost identical quadratic functional
Z 4
0 2

b v] =
Q[
(v ) v 2 dx.
(20.36)
0

The only difference is in the upper limit to the integral. A quick computation shows that
the function v(x) = x(4 x) satisfies the homogeneous Dirichlet boundary conditions
v(0) = 0 = v(4), but
Z 4

b v] =
(4 2 x)2 x2 (4 x)2 dx = 128
Q[
5 < 0.
0

b v ] is not positive definite. Our preceding analysis does not apply beTherefore, Q[
cause the function tan x becomes singular at x = 12 , and so the auxiliary integral
Z 4
(v 0 + v tan x)2 dx does not converge.
0

The complete analysis of positive definiteness of quadratic functionals is quite subtle.


The strange appearance of tan x in this particular example turns out to be an important
clue! In the interests of brevity, let us just state without proof a fundamental theorem,
and refer the interested reader to [54] for full details.
Theorem 20.4. Let A(x), B(x), C(x) C0 [ a, b ] be continuous functions. The quadratic functional
Z b

A v 2 + 2 B v v 0 + C (v 0 )2 dx
Q[ v ] =
a

is positive definite, so Q[ v ] > 0 for all v 6 0 satisfying the homogeneous Dirichlet boundary
conditions, provided
(a) C(x) > 0 for all a x b, and
(b) For any a < c b, the only solution to the associated linear EulerLagrange
boundary value problem
(C w0 )0 + (A B 0 ) w = 0,

w(a) = 0 = w(c),

(20.37)

is the trivial function w(x) 0.


Remark : A value c for which (20.37) has a nontrivial solution is known as a conjugate
point to a. Thus, condition (b) can be restated that the variational problem has no
conjugate points in the interval [ a, b ].
Example 20.5. The quadratic functional
Z b
0 2

Q[ v ] =
(v ) v 2 dx

(20.38)

3/7/03

897

c 2003

Peter J. Olver

has EulerLagrange equation


w00 w = 0.
The solutions w(x) = k sin x satisfy the boundary condition w(0) = 0. The first conjugate
point occurs at c = where w() = 0. Therefore, Theorem 20.4 implies that the quadratic
functional (20.38) is positive definite provided the upper integration limit b < . This
explains why the first quadratic functional (20.35) is positive definite, since there are no
conjugate points on the interval [ 0, 1 ], while the second (20.36) is not because the first
conjugate point lies on the interval [ 0, 4 ].
In the case when the quadratic functional arises as the second variation of a functional
(20.11), then the coefficient functions A, B, C are given in terms of the Lagrangian L(x, u, p)
by formulae (20.33). In this case, the first condition in Theorem 20.4 requires
2L
(x, u, u0 ) > 0
p2
for the minimizer u(x). This is known as the Legendre condition. The second, conjugate
point condition requires that the so-called linear variational equation
2
2

d
L
L
d 2L
0 dw
0
0

+
(x, u, u )
(x, u, u )
(x, u, u ) w = 0
(20.39)
dx p2
dx
u2
dx u p
has no nontrivial solutions w(x) 6 0 that satisfy w(a) = 0 and w(c) = 0 for a < c b.

20.4. Multi-dimensional Variational Problems.


The calculus of variations encompasses a very broad range of mathematical applications. The methods of variational analysis can be applied to an enormous variety of
physical systems, in which the equilibrium configurations minimize a suitable functional
typically, the potential energy of the system. The minimizing configurations are among
the critical points of the functional where its functional gradient vanishes. Following similar
computational procedures as in the simple one-dimensional version, we find that the critical
functions are characterized as solutions to a system of partial differential equations, called
the EulerLagrange equations associated with the variational principle. Each solution to
the boundary value problem specified by the EulerLagrange equations is, thus, a candidate minimizer for the variational problem. In many applications, the EulerLagrange
equations suffice to single out the desired physical solutions, and one does not continue on
to the considerably more difficult second variation.
Implementation of the variational calculus for functionals in higher dimensions will be
illustrated by looking at a specific example a first order variational problem involving
a single scalar function of two variables. Thus, we consider a functional in the form
ZZ
J[ u ] =
L(x, y, u, ux , uy ) dx dy,
(20.40)

of a double integral over a prescribed domain R 2 . The Lagrangian L(x, y, u, p, q) is


assumed to be a sufficiently smooth function of its five arguments. Our goal is to find the
3/7/03

898

c 2003

Peter J. Olver

function(s) u = f (x, y) that minimize the given functional among all sufficiently smooth
functions that satisfy a set of prescribed boundary conditions on . The most important
are our usual Dirichlet, Neumann and mixed boundary conditions. For simplicity, we
concentrate on the Dirichlet boundary value problem
u(x, y) = g(x, y)

for

(x, y) .

(20.41)

The First Variation


The basic necessary condition for an extremum (minimum or maximum) is obtained
in precisely the same manner as in the one-dimensional framework. Consider the function
ZZ
g(t) J[ u + t v ] =
L(x, y, u + t v, ux + t vx , uy + t vy ) dx dy

for t R. The variation v(x, y) is assumed to satisfy homogeneous Dirichlet boundary


conditions
v(x, y) = 0
for
(x, y) ,
(20.42)

to ensure that u + t v satisfies the same boundary conditions (20.41) as u itself. Under
these conditions, if u is a minimizer, then the scalar function g(t) will have a minimum
at t = 0, and hence g 0 (0) = 0. When computating g 0 (t), we assume that the functions
involved are sufficiently smooth so as to allow us to bring the derivative d/dt inside the
integral and then apply the chain rule. At t = 0, the result is

ZZ

L
d
L
L
0

=
v
J[ u + t v ]
+ vx
+ vy
dx dy,
(20.43)
g (0) =
dt
u
p
q

t=0

where the derivatives of L are evaluated at x, y, u, ux , uy . To identify the functional gradient, we need to rewrite this integral in the form of an inner product
ZZ
0
g (0) = h J[ u ] ; v i =
h(x, y) v(x, y) dx dy,
where
h = J[ u ].

As before, we need to remove the offending derivatives from v. In two dimensions, the
requisite integration by parts formula

ZZ
I
ZZ
v
v
w2
w1
dx dy,
w1 +
w dx dy =
v ( w2 dx + w1 dy)
v
+
y 2
x
y
x

(20.44)
in which w1 , w2 are arbitrary smooth functions, appears in (14.79). Setting w 1 = L/p, w2 =
L/q, we find

ZZ
ZZ
L

L
L
L

+ vy
vx
dx dy =
v

dx dy,
p
q
x p
y q

where the boundary integral vanishes when v(x, y) satisfies the homogeneous Dirichlet
boundary conditions (20.42) that we impose on the allowable variations. Substituting this
result back into (20.43), we conclude that

ZZ

L
L
L
0
g (0) =
v

dx dy = 0.
(20.45)
u
x p
y q

3/7/03

899

c 2003

Peter J. Olver

The quantity in brackets is desired first variation or functional gradient:

L
L
L

J[ u ] =

u
x p
y q
which must vanish at a critical function. We conclude that the minimizer u(x, y) must
satisfy the EulerLagrange equation

L
L
L
(x, y, u, ux , uy )
(x, y, u, ux , uy )
(x, y, u, ux , uy )
= 0.
u
x p
y q
(20.46)
Once we explicitly evaluate the deriatives, the net result is a second order partial differential
equation
Lu Lxp ux Lup uy Luq uxx Lpp 2 uxy Lpq uyy Lqq ,
(20.47)
where we use subscripts to indicate derivatives of both u and L, the latter being evaluated
at x, y, u, ux , uy . Solutions to the EulerLagrange equation are critical functions for the
variational problem, and hence include any local and global minimizers. Determination of
which solutions are geniune minima requires a further analysis of the positivity properties
of the second variation, which is beyond the scope of our introductory treatment. Indeed,
a complete analysis of the positive definiteness of the second variation of multi-dimensional
variational problems is very complicated, and still awaits a completely satisfactory resolution!
Example 20.6. As a first elementary example, consider the Dirichlet minimization
problem
ZZ
2

2
1
J[ u ] =
(20.48)
2 ux + uy dx dy

that we first encountered in our analysis of the solutions to the Laplace equation (14.91).
In this case, the associated Lagrangian is
L = 21 (p2 + q 2 ),

with

L
= 0,
u

L
= p = ux ,
p

L
= q = uy .
q

Therefore, the EulerLagrange equation (20.46) becomes

(ux )
(u ) = uxx uyy = u = 0,
x
y y

which is the two-dimensional Laplace equation. Subject to the boundary conditions, the
solutions, i.e., the harmonic functions, are the critical functions for the Dirichlet variational principle. This reconfirms the Dirichlet characterization of harmonic functions as
minimizers of the variational principle, as stated in Theorem 14.13. However, the calculus
of variations approach, as developed so far, leads to a much weaker result since it only
singles out the harmonic functions as candidates for minimizing the Dirichlet integral; they
could just as easily be maximing functions or saddle points. In the quadratic case, the
direct algebraic approach is, when applicable, the more powerful, since it assures us that
the solutions to the Laplace equation really do minimize the integral among the space of
3/7/03

900

c 2003

Peter J. Olver

functions satisfying the appropriate boundary conditions. However, the direct method is
restricted to quadratic variational problems, whose EulerLagrange equations are linear
partial differential equations. In nonlinear cases, one really does need to utilize the full
power of the variational machinery.
Example 20.7. Let us derive the EulerLagrange equation for the minimal surface
problem. From (20.8), the surface area integral
ZZ p
p
1 + u2x + u2y dx dy
has Lagrangian
L = 1 + p2 + q 2 .
J[ u ] =

Note that

L
= 0,
u

L
p
,
=p
p
1 + p2 + q 2

L
q
.
=p
q
1 + p2 + q 2

Therefore, replacing p ux and q uy and then evaluating the derivatives, the Euler
Lagrange equation (20.46) becomes
(1 + u2y )uxx + 2ux uy uxy (1 + u2x )uyy
uy

ux
p
p

= 0.
x 1 + u2 + u2 y 1 + u2 + u2
(1 + u2x + u2y )3/2
y
y
x
x

Thus, a surface described by the graph of a function u = f (x, y) is a candidate for minimizing surface area provided it satisfies the minimal surface equation
(1 + u2y ) uxx 2 ux uy uxy + (1 + u2x ) uyy = 0.

(20.49)

Thus, we are confronted with a complicated, nonlinear, second order partial differential
equation, which has been the focus of some of the most sophisticated and deep analysis
over the preceding two centuries, with significant progress on understanding its solution
only within the past 70 years. We have not developed the sophisticated analytical and
numerical techniques that are required to have anything of substance to say about its
solutions here, and will refer the interested reader to the advanced texts [91, 93].
Example 20.8. The small deformations of an elastic body R n are described by
the displacement field, u: R n . Each material point x in the undeformed body
e = { x + u(x) | x }.
will move to a new position x + u(x) in the deformed body
The one-dimensional case governs bars, beams and rods, two-dimensional bodies include
thin plates and shells, while n = 3 for fully three-dimensional solid bodies. See [8, 56] for
details and physical derivations.
For small deformations, we can use a linear theory to approximate the much more
complicated equations of nonlinear elasticity. The simplest case is that of an isotropic,
homogeneous planar body R 2 , i.e., a thin plate. The equilibrium mechanics are
T
described by the deformation function u(x) = ( u(x, y), v(x, y) ) . A detailed physical
analysis of the constitutive assumptions leads to the following minimization principle
ZZ

1
2
2
1
dx dy
J[ u, v ] =
2 k u k + 2 ( + )( u)

ZZ
(20.50)
2

1
2
2
2
1
=
2 + (ux + vy ) + 2 (uy + vx ) + ( + ) ux vy dx dy.

3/7/03

901

c 2003

Peter J. Olver

The parameters , are known as the Lame moduli of the material, and govern its intrinsic
elastic properties. They are measured by performing suitable experiments on a sample of
the material. Physically, (20.50) represents the stored (or potential) energy in the body
under the prescribed displacement. Nature, as always, seeks the displacement that will
minimize the total energy.
To compute the EulerLagrange equations, we consider the functional variation g(t) =
J[ u + t f, v + t g ], in which the individual variations f, g are arbitrary functions subject
only to the given homogeneous boundary conditions. If u, v minimize J, then g(t) has a
minimum at t = 0, and so we are led to compute
ZZ
0
g (0) = h J ; f i =
(f u J + g v J) dx dy,

which we write as an inner product (using the standard L2 inner product between vector
T
fields) between the variation f and the functional gradient J = ( u J, v J ) . For the
particular functional (20.50), we find
ZZ

0
+ (ux fx + vy gy ) + (uy fy + vx gx ) + ( + ) (ux gy + vy fx ) dx dy.
g (0) =

We use the integration by parts formula (20.44) to remove the derivatives from the variations f, g. Discarding the boundary integrals, which are used to prescribe the allowable
boundary conditions, we find
!

ZZ
(
+
2
)
u
+

u
+
(
+
)
v
f
+
yy
xy

xx
dx dy.
g 0 (0) =
+ ( + ) uxy + vxx + ( + 2 ) vyy g

The two terms in brackets give the two components of the functional gradient. Setting
them equal to zero, we derive the second order linear system of EulerLagrange equations
( + 2 ) uxx + uyy + ( + ) vxy = 0,

( + ) uxy + vxx + ( + 2 ) vyy = 0,


(20.51)
known as Naviers equations. The solutions are the critical displacements (u, v) that are
potential energy minimizers.
Since we are dealing with a quadratic functional, a more detailed algebraic analysis will demonstrate that the solutions to Naviers equations are the minimizers for the
variational principle (20.50). Although only valid in a limited range of physical and kinematical conditions, the solutions to the planar Naviers equations and its three-dimensional
counterpart are successfully used to model a wide class of elastic materials.

3/7/03

902

c 2003

Peter J. Olver

Chapter 21
Nonlinear Partial Differential Equations
The last topic to be discussed in this book is the vast and active contemporary research area of nonlinear partial differential equations. Leaving aside quantum mechanics,
which remains a purely linear theory, most genuine physical systems are modeled by
nonlinear partial differential equations. Attempts to survey even a tiny fraction of such a
all-encompassing range of phenomena, methods, results, and mathematical developments,
are necessarily doomed to failure.
Historically, beyond a few examples and some significant efforts in differential geometry, fluid mechanics, and relativity, relatively little was known about nonlinear partial
differential equations. Most of the most basic phenomena that now drive modern-day
research, such as solitons, chaos, stability, etc., remained undetected, or only dimly outlined. The last fifty years has witnessed a remarkable blossoming of our understanding,
due in large part to the advent of large scale computing and significant advances in numerical methods for integating nonlinear systems. Numerical experimentation suddenly
exposed many unexpected phenomena, including chaos and solitons, to the light of day.
New analytical methods, new mathematical theories, and new computational algorithms
have precipitated this revolution in our understanding and study of nonlinear systems, an
activity that continues to grow in intensity and breadth. Each leap in computing power
and theoretical advances has led to yet deeper understanding of nonoinear phenomena,
but also points out how far we have yet to go.
Linear differential equations are relatively well-behaved. We already encountered some
of the possible phenomena in our study of nonlinear ordinary differential equations, and
nonlinear partial differential equations lead to yet more classes of possible behaviors. To
make sense of this bewildering variety of methods, equations, and results, it is essential
build upon a firm foundation of, first of all, linear systems theory, and secondly, nonlinear
algebraic and ordinary differential equations. In this chapter, we only have room to reveal
a couple of the more important ideas that have arisen from the study of nonlinear partial
differential equations. We treat a few important nonlinear partial differential equations,
arising in gas dynamics, fluid mechanics, and nonlinear dispersion and heat conduction.
Topics include shock waves, blow up, viscous limits, similarity solutions, and solitons. We
will only be able to consider nonlinear partial differential equations in a single space coordinate and time. Many of the multidimensional versions governing our three-dimensional
world remain on the cutting edge of contemporary research activity.
We arrange our topics according to the order of the underlying differential equation.
First order nonlinear partial differential equations govern nonlinear waves and vibrations.
Such nonlinear wave motions arise in gas dynamics, water waves, elastodynamics, chemical
3/7/03

903

c 2003

Peter J. Olver

reactions, flood waves in rivers, chromatography, traffic flow, and a range of biological and
ecological systems. One of the most important phenomena that has not yet appeared in our
study of linear partial differential equations is the break down of solutions by the formation
of shock waves; a striking example is the supersonic boom produced by an airplane that
breaks the sound barrier.
Second order partial differential equations govern nonlinear diffusion processes, including heat flow and population dynamics. The most important equation, Burgers equation,
can, surprisingly, be linearized by transforming it to the heat equation, and this accident
proves to be of considerable help in understanding the nonlinear diffusion processes, and
their limiting behavior as the diffusion parameter or viscosity tends to zero. Third order
partial differential equations arise in the study of dispersive wave motion, including water
waves, plasma waves and others. We first treat the linear dispersive model, contraxsting
it with the hyperbolic models we encountered earlier in the text. Finally, we introdcue the
rmarkable KortewegdeVries equation, which arises as a model for nonlinear water waves.
Despite its nonlinearity, it supports stable localized traveling wave solutions that maintain
their shape under collisions, and have been called solitons. The KortewegdeVries equation is an example of an integrable system, since it can be solved by an associated linear
problem. Chaos and integrability are the two great themes in modern nonlienr applied
mathematics, and the student is well-advised to pursue both in further studies.

21.1. Nonlinear Waves and Shocks.


Before attempting to tackle any nonlinear partial differential equations, we should
carefully review the solution to the simplest linear first order partial differential equation
the one-way or unidirectional wave equation
ut + c ux = 0.

(21.1)

First, assume that the wave velocity c is constant. According to Proposition 13.6, a solution
u(t, x) to this partial differential equation is constant along the characteristic lines of slope
dx
= c,
namely
x c t = constant
dt
As a consequence, the solutions are all of the form

(21.2)

u = p(x c t)
where p() is an arbitrary function of the characteristic variable = xc t. To a stationary
observer, the solution is a wave of unchanging form moving at velocity c. The case c > 0
corresponds to a wave that translates to the right, as illustrated in Figure 21.1.
Slightly more complicated, but still linear, is the wave equation
ut + c(x) ux = 0,

(21.3)

where the variable wave velocity c(x) depends upon the position of the wave. This equation
models unidirectional waves propagating through a non-uniform, but static medium. Generalizing the constant coefficient construction (21.2), we define the characteristic curves
3/7/03

904

c 2003

Peter J. Olver

0.5

0.5

0.5

0.2

0.4

0.6

0.8

1.2

1.4

0.2

0.4

0.6

0.8

1.2

1.4

0.2

-0.5

-0.5

-0.5

-1

-1

-1

Figure 21.1.

0.4

0.6

0.8

1.2

1.4

Traveling Wave of Constant Form.

for the wave equation (21.3) to be the solutions to the autonomous ordinary differential
equation
dx
= c(x).
(21.4)
dt
Thus, unlike the constant velocity version, the characteristic curves are not necessarily
straight lines in the (t, x) plane. Nevertheless, the same observation that solutions remain
constant along the characteristics continues to apply.
Proposition 21.1. The solutions to the linear wave equation (21.3) are constant on
the characteristic curves.
Proof : As usual, the easiest way to prove a quantity is constant is to show that its
derivative is zero. Let x(t) be a characteristic curve, i.e., a solution to (21.4), parametrized
by the time t. The value of a solution u(t, x) of the wave equation at the point (t, x(t))
on the given characteristic curve is h(t) = u(t, x(t)). Our goal is to prove that h(t) is a
constant function of t. To differentiate h(t) with respect to t, we invoke the chain rule:
d
u
dx u
u
u
dh
=
u(t, x(t)) =
(t, x(t)) +
(t, x(t)) =
(t, x(t)) + c(x(t))
(t, x(t)) = 0,
dt
dt
t
dt x
t
x
where we replace dx/dt by c(x) since we are assuming that x(t) is a characteristic curve,
and hence satisfies (21.4). The final combination of derivatives is zero whenever u solves
the wave equation (21.1). Therefore, h(t) = u(t, x(t)) is constant.
Q.E.D.
Since the characteristic curve differential equation (21.4) is autonomous, it can be
immediately solved:
Z
dx
h(x) =
= t + ,
c(x)
where is the constant of integration. Therefore, the characteristic curves are defined
implicitly by either of the equations
h(x) = t +

or

x = g(t + ),

(21.5)

where g = h1 is the inverse function.


Any function which is constant along the curves defined by (21.5) must be a function
of the characteristic variable = h(x) t. As a consequence, Proposition 21.1 implies

Note that the present definition of characteristic variable has changed slightly from the
constant velocity case.

3/7/03

905

c 2003

Peter J. Olver

that we can write the solution to the wave equation in the form
u = p(h(x) t),

(21.6)

where p() is an arbitrary function of the characteristic variable.


To find the solution that satsifies the given initial conditions
u(0, x) = f (x)

(21.7)

we merely substitute the solution (21.6), leading to an implicit equation


p(h(x)) = f (x),

p() = f h1 () = f [g()].

and hence

Graphically, the solution must be constant along each characteristic curve. Therefore, to
find the value of u(t, x), we determine where the characteristic curve passing through the
point (t, x) intersects the x axis; if this point is (0, y), then u(t, x) = u(0, y) = f (y). See
Figure ccx .
Example 21.2. Consider the equation
u
u
1
+ 2
= 0.
t
x + 1 x

(21.8)

The characteristic curves are the solutions to the first order ordinary differential equation
dx
1
= 2
.
dt
x +1
Integrating, we find

(x2 + 1) dx =

1
3

x3 + x = t + ,

and the resulting characteristic curves are plotted in Figure wcxx .


The general solution to the equation takes the form

u = p 13 x3 + x t ,

where p() is an arbitrary function of the characteristic variable = 13 x3 + x t. A typical


solution, corresponding to initial data u(t, 0) = is plotted in Figure wcxx . The fact that
the characteristic curves are not straight means that, although the wave remains constant
along each individual curve, a stationary observer will witness a variable profile as the
wave moves to the right. Waves coming in from negative values of x speed up as they
arrive at the origin, and then slow down once they pass. As a result, we observe the wave
first spreading out as it goes to the origin, and then once it passes contracting back down.
Example 21.3. Consider the equation
ut x ux = 0.

(21.9)

The characteristic curves are the solutions to


dx
= x,
dt
3/7/03

906

c 2003

Peter J. Olver

and so
x(t) = c e t ,

or

The solution takes the form

log | x | + t = .

u = p ( log | x | + t ) = P (x et ).
Therefore, for initial data
u(0, x) = f (x)

the solution is

u = f (x et ).

For example, the solution


u(t, x) =

1
e 2 t
=
(x et )2 + 1
x2 + e 2 t

corresponding to initial data u(t, 0) = f (x) = (x2 + 1)1 is plotted in See Figure wavecx .
Note that since the characteristic curves all converge on the t axis, the solution becomes
more and more concentrated at the origin.
A Nonlinear Wave Equation
One of the simplest possible nonlinear partial differential equations is the nonlinear
wave equation
ut + u ux = 0.
(21.10)
first systematically studied by Riemann . Since it appears in so many applications, this
equation goes under a variety of names in the literature, including the Riemann, inviscid
Burgers, dispersonless KortewegdeVries, and other equations.
The equation (21.10) has the form of a unidirectional wave equation u t + c ux = 0 in
which the wave velocity c = u depends, not on the position x, but rather on the magnitude
of the wave. Larger waves move faster, and overtake smaller waves. Waves of depression,
where u < 0, move in the reverse direction to the left.
Fortunately, the method of characteristics that was developed for linear wave equations
also works in the present nonlinear situation and leads to a complete solution to the
equation. Mimicking our previous construction, (21.4), let us define the characteristic
curves of the nonlinear wave equation (21.10) by the formula
dx
= u(t, x).
(21.11)
dt
In this case, the characteristics depend upon the solution, and so it appears that we will
be not able to specify the characteristics until we know the solution u(t, x). Nevertheless,

Actually, this is not quite correct, since it appears to imply that the solution is an even
function of x.

In addition to his contributions to complex analysis, partial differential equations and number
theory, Riemann also was the inventor of Riemannian geometry, which proved absolutely essential
for Einsteins theory of general relativity some 70 years later!

3/7/03

907

c 2003

Peter J. Olver

they maintain the critical property that the solution u is constant along its characteristic
curves.
To prove this claim, we assume that x = x(t) parametrizes a characteristic curve. We
need to show that h(t) = u(t, x(t)) is constant along the curve. As before, we differentiate
using the chain rule and (21.11), to deduce
dh
d
u
dx u
u
u
=
u(t, x(t)) =
(t, x(t))+
(t, x(t)) =
(t, x(t))+u(t, x(t))
(t, x(t)) = 0.
dt
dt
t
dt x
t
x
The final expression vanishes because u is assumed to solve the wave equation (21.10).
Since the derivative of h(t) = u(t, x(t)) is zero, this quantity must be a constant, as stated.
Now, since the solution u(t, x(t)) is constant on the characteristic curve, the right
hand side of its defining equation (21.11) is a constant. Therefore, the derivative dx/dt is
constant, and the characteristic curve is a straight line! Consequently, each characteristic
curve
x = u t + b,
is a straight line of slope u, the value of the solution on the line. The larger u is, the
steeper the characteristic line, and the faster the wave moves. The characteristic variable
= x u t depends upon the solution, which can be written in implicit form
u = f (x u t),

(21.12)

where f () is an arbitrary function of the characteristic variable. For example, if f ) =


+ is an affine function, then
u = (x u t) + ,

and hence

u(t, x) =

x +
.
1 + t

(21.13)

If > 0, this represents a straight line solution that gradually flattens out as t . On
the other hand, if < 0, the solution rapidly steepens to vertical as t t ? = 1/, at
which point the solution reaches a singularity.
To construct a solution u(t, x) to the initial value problem
u(0, x) = f (x)

(21.14)

we note that, at t = 0, the implicit solution formula formula (21.12) reduces to u(0, x) =
f (x), and hence the function f coincides with the initial data! However, because (21.12)
defines u(t, x) implicitly, it is not clear (a) whether it can be solved to give a unique
value for the solution at a given point (t, x), and, (b) what the qualtitative features and
dynamical behavior of the solution means.
More instructive and revealing is the following geometric approach. Through each
point (x, 0) on the x axis, draw the characteristic line whose slope f (x) equals the value
of the initial data. The solution u(t, x) will, by our computation, have the same value
u = f (x) on the entire line. In Figure Rsol we illustrate with an example. Now, unless all
the characteristic lines are parallel which means that they have the same slope, which
means that u has the same value on each one, which means that u c is a trivial constant
solution they must cross. When two characteristic lines cross, the solution u(t, x) is not
3/7/03

908

c 2003

Peter J. Olver

well-defined, since it is supposed to have two different values, one corresponding to each
line. So what happens?
We are primarily interested in the future, t > 0, and so there are two possibilities.
The first is if the initial data f (x) is nondecreasing, so f (x) f (x0 ) whenever x x0 .
This is assured if the derivative f 0 (x) 0 is never negative. In this case, the characteristic
lines emanting from the x axis fan out in the upper half plane t > 0, and so never cross;
see Figure chl . The solution is then well defined for all positive time, and represents a
wave of rarefaction, which gradually spreads out as time progresses. A typical example is
plotted in Figure rare , corresponding to initial data u(0, x) = tan 1 x.
The more interesting case is when f 0 (x) < 0. Now the characteristic lines starting at
t = 0 cross at some positive time t > 0. At this point, the mathematics breaks down, and
we do not know how to continue the solution as a single-valued function. One could, in
a theoretical framework, continue the solution as some form of multiply-valued function,
as in Figure multsol , but in a physical model this is not permitted. The solution u(t, x)
represents some physical quantity, e.g., velocity, pressure, etc., and so can only have one
physically meaningful value at each point.
The mathematics by itself cannot tell us how to continue with this solution,. We
therefore need to return to the physics underlying the partial differential equation, and
ask what sort of phenomenon we are trying to model. The most instructive is to view the
equation as a simple model of compressible fluid flow in a single space variable, e.g., gas
in a pipe. If we push a piston down the end of a long pipe then the gas will move ahead
of the piston and compress. If we push the piston too fast, the gas will compress near the
piston. However, if the piston moves too rapidly the gas piles up on top of itself and a
shock wave forms.
A similar phenomenon occurs with the solutions to the nonlinear Riemann equation.
Consider the initial data u(0, x) = 2 tan1 x, plotted in the first figure in Figure shu .
In the companion picture we plot the characteristic lines for this particular initial data.
Note that they all have poitive slope, and so all the points on the curve will move to the
right. Since the initial data is decreasing, the points to the right will move faster than
those to the left, and eventually overtake them. This does not happen instantaneously; it
takes a certain amount of time for the characteristic lines to cross. As they are getting
closer together, the solution u(t, x) for a fixed time t is steepening. At the moment of
crossing, the tangent to the curve has become vertical, and so ux (t, x? ) as t t? ,
the time of formation of the (first) shock.
The crossing time can be determined from the implicit solution formula (21.12). Indeed, if we differentiate with respect to x, we find

u
=
f (x u t) = f 0 ()
x
x

u
1t
x

where

= x ut

is the characteristic variable, which is constant along the characteristic lines. Solving,
f 0 ()
u
=
.
x
1 + t f 0 ()
3/7/03

909

c 2003

Peter J. Olver

Therefore, the slope


u

x

as

1
.
f 0 ()

In other words, if the initial data has negative slope at position x, so f 0 (x) < 0, then the
solution along the characteristic line emanating from the point (x, 0) will break down at
the time 1/f 0 (x). As a consequence, the first shock will appear at time

1
t? = min
< x < .
(21.15)
f 0 (x)
For instance, in the Figure shu , we have
f (x) =

tan1 x,
2

f 0 (x) =

1
,
1 + x2

and so the shock first appears at time


t? = min(1 + x2 ) = 1.
What happens after the shock time t? ? The mathematical solution steepens to vertical
as t t? and, afterwards, becomes a triply-valued function of x on some progressively
wider and wider interval; see Figure shu . Physically, one needs to choose which of the
three possible values should be used at a given point. The mathematics will not tell us the
answer, and we must reconsider the physical system that we are modeling.
The physical assumption that underlies the specification of where the shock wave
appears is known as an entropy condition. The simplest version, which applies to many
physical systems, is an equal area rule. Draw the vertical shock line where the areas of the
two lobes in the multiply valued solution are equal, as in Figure ea . For example,
Example 21.4. An interesting case is when the initial data has the form of a step
function with a single jump discontinuity:

a,
x < 0,
u(0, x) = f (x) = a + b (x) =
.
(21.16)
b,
x>0
If a > b > 0, then the initial data is in the form of a shock. If we use the mathematical
solution by continuing along the characteristic lines, the solution at time t is multiplyvalued in the region b t < x < a t where it assumes both values a and b as illustrated
in Figure sws . If we use the equal area rule, we draw the shock line halfway along, at
x = 21 (a + b) t. Therefore, the shock moves with speed 21 (a + b) equal to one half the
magnitude of the jump (and the value of the step function at the jump according to the
Fourier convention). Behind the shock the solution has value a and in front the smaller
value b. A graph of the characteristic lines appears in Figure swsch .
By way of contrast, if 0 < a < b, then the characteristic lines diverge from the
shock point, and the mathematical solution is not well-defined in the wedge-shaped region
a t < x < b t. we must decide how to connect the two regions where the solution is defined.
3/7/03

910

c 2003

Peter J. Olver

Physical reasoning points to using an affine solution, a simple modification of the solution
(21.13) to fit the two pieces into a continuous function. Note that
x
,
t
solves the differential equation; moreover, u(t, a t) = a, and u(t, b t) = b. Therefore, the
desired solution is

x a t,
a,
x/t,
a t x b t, ,
u(t, x) =

b,
x bt
which is graphed in Figure swsr . This solution is known as a rarefaction wave, whereas the
shock solution represents a wave of compression. Anyone caught in a traffic jam recognized
the compresssion waves, where the traffic is bunched together and almost stationery, while
the interspersed rarefaction waves represent freely moving traffic. (An intelligent drive wil
recognize the rarefaction waves moving through the jam and use them to switch lanes!)
The observed, and frustrating trafffic jam phenomenon is a direct result of the nonlinear
wave model for traffic flow.
u(t, x) =

The entropy condition allows us to progress beyond the formation of a simple shock.
As other characteristic lines cross, other shocks form. The shocks themselves can move at
different velocities, and when a fast-moving shock catches up with a slow moving shock,
one must decide how to merge the shocks together to retain a physically consistent solution.
At this point, the mathematics havs become too complicated for us to pursue in any more
detail, and we refer the reader to [123] for a detailed discussion, along with applications
to equations of gas dynamics, flood waves in rivers, motion of glaciers, chomotography,
traffic flow and many other physical systems.
Note that this implies irreversibility of the solutions to the nonlinear wave equation.
One cannot simply run time backwards and expect the shock to disappear. However, this
is a different issue than the irreversibility of the heat equation, which was due to its illposedness in backwards time. One can run the nonlinear wave equation backwards, but
this would result, typically, in the formation of a different collection of shocks.

21.2. Nonlinear Diffusion.


First order partial differential equations, beginning with the simple scalar equation
(21.10), and progressing through the equations of gas dynamics on to the full-blown Euler
equations of fluid mechanics, model conservative wave motion. Such models fail to account
for frictional and viscous effects. As in the linear heat equation, dissipative effects such
as friction and viscosity are governed by second order elliptic differential operators, and
hence intorduce second order terms into the wave model. In this section, we study the
very simplest model that includes both nonlinar wave motion and dissipation, known as
Burgers equation.
Burgers Equation
When a shock wave forms, there is a breakdown in the mathematical solution to the
equation. But the physical processes continue. This indicates that our assumptions gov3/7/03

911

c 2003

Peter J. Olver

erning the physical situation modeled by the partial differential equation are not complete,
and are neglecting certain significant physical effects. In the case of gas dynamics, the
nonlinear wave equation (21.10) does not build in any damping effects due to viscosity in
the fluid. Dissipative or frictional or viscous effects are, as we know, governed by second
order differential operators. The simplest is the linear heat equation which models a broad
range of dissipative phenomena, but fails to take into account nonlinear physical effects.
The simplest nonlinear diffusion equation is known as Burgers equation and takes
the form
ut = uxx + u ux .
(21.17)
The term uxx represents linear diffusion, as in the heat equation, and the diffusion coefficient > 0 must be positive in order that the equation be well-posed. One can also
interpret the linear term as a viscosity, and Burgers equation represents a very simplified
form of the equations of fluid mechanics with viscosity. The higher the viscosity , the
slower the diffusion. The nonlinear term represents a simple advection.
In the inviscid limit, as the diffusion term goes to zero, 0, Burgers equation
reduces to the nonlinear wave equation (21.10), which, as a result, is often referred to as
the inviscid Burgers equation in the literature.
The HopfCole Transformation
While the Burgers equation is a fully nonlinear partial differential equation, there
is a remarkable nonlinear transformation that converts it into the linear heat equation.
Although one can find this result in the nineteenth century textbook by Forsyth, [49; vol.
6, p. 102], its modern rediscovery by Eberhard Hopf, [67], and Julian Cole, [27], was a
milestone in the study of nonlinear partial differential equations, and is now named after
them.
To simplify the presentation, we shall set the diffusion coefficient = 1 and the
nonlinear coefficient = 2, and so work with the Burgers equation in a simplified form
ut = uxx + 2 u ux .

(21.18)

A straightforward rescaling argument can then be used to produce the solution to the
general version (21.17); see below.
We shall construct the HopfCole transformation in two stages. The first is to introduce a potential function for the solution u(t, x) to Burgers equation, which, since we are
dealing with a one-dimensional problem, merely means a function (t, x) whose spatial
derivative is the solution:

= u.
(21.19)
x
Thus, the potential function satisfies the partial differential equation
tx = xxx + 2 x xx ,

Note that the apostrophe goes after the s since the equation is named after the applied
mathematician J. Burgers.

3/7/03

912

c 2003

Peter J. Olver

which we write in the form


varphit xx 2x = 0.
x

We can integrate both sides with respect to x, and so

t xx 2x = h(t)

(21.20)

for some function h(t). However, since the potential is only defined up to an additive
function of t, we can set h(t) 0 without loss of generality, and thereby deduce the
potential Burgers equation
varphit = xx + 2x .
(21.21)
The potential form is still nonlinear. The second stage of the process is to make the
nonlinear change of variables
v = e ,
where v(t, x) is a new unknown function. We compute the corresponding derivatives via
the chain rule:

vt = t e ,
v x = x e ,
vxx = xx + 2x e .

Therefore, multiplying the potential Burgers equation (21.21) by e , we find that it reduces
to the heat equation
vt = vxx .
(21.22)
Thus, every solution (t, x) to the potential Burgers equation produces a solution v(t, x)
to the heat equation.
The converse is not completely valid, since to find
= log v,

(21.23)

we must assume that v(t, x) > 0 is positive solution to the heat equation. Fortunately,
the maximum principle tells us that if the initial data for the heat equation is positive,
so is the solution at all subsequent times, and so constructing positive solutions is not a
significant problem. Moreover, again assuming positivity, the derivative
u = x =

vx
v

(21.24)

is a solution to Burgers equation in its simplified form (21.18).


For example, the separable solution
v(t, x) = + e

sin x

leads to
To solve the initial value problem
u(0, x) = f (x)
3/7/03

913

c 2003

Peter J. Olver

for Burgers equation, we note that


Z x
(0, x) = F (x) =
dyf (y),

and hence

v(0, x) = H(x) = exp

Viscosity Solutions
First we put the parameters back into the equation. A simple rescaling tells us that
if u(t, x) solves (21.18), then
u
b(t, x) = u(t, x)

satisfies

and hence

u
bt = ut ,

Therefore, we should rescale to

u
bxx = 2 uxx ,

u
bx = ux ,

u
bt =

u
bxx + u
bu
bx .
2

u
b(t, x) = u

x
t,

(21.25)

to produce a solution to (21.17).


One way to formulate a shock solution to the Riemann equation is as the limit, as the
viscosity goes to zero, of classical solutions to the Burgers equation. It can be shown that
this limit is consistent with the equal area rule for drawing the shocks. More generally,
this viscosity solution method allows one to continue the solutions into the regimes where
multiple shocks merge and interact. Again, we refer the interested reader to [123] and to
[Visc].
Reaction diffusion?

21.3. Dispersion and Solitons.


Finally, we look at a remarkable third order evolution equation that serves to introduce
yet further phenomena, both linear and nonlinear. The simplest linear partial differential
equation of a type that we have not yet considered is the third order equation
ut = uxxx

(21.26)

It is the third member of the hierarchy of simple evolution equations that starts with the
simple ordinary differential equation ut = u, then proceeds to the unidirectional wave
equation ut = ux , and then the heat equation ut = uxx . Each member of the hierarchy has
its own range of properties. The third order case is a simple model for linear dispersive
waves.
We shall only look at the equation on the entire line, so x R, and so can ignore
additional complications caused by boundary conditions. The solution to the equation is
uniquely specified by initial data
u(0, x) = f (x),
3/7/03

914

< x < .
c 2003

Peter J. Olver

See [X] for a proof.


Let us apply the Fourier transform to solve the equation. Using separation of variables
Substitute
u(t, x) = e i t+ i k x
where is the frequency and k is called the wave number . We find
= k3
is the dispersion relation. Therefore, the solution is given by superposition as a Fourier
integral
Z
3
u(t, x) =
e i k t+ i k x fb(k) dk

In particular, the solution with a concentrated initial disturbance


u(0, x) = (x)
is

x
u(t, x) = Ai 1/3
t

in terms of the Airy function. See Figure ee3 for a graph.


Although energy is conserved, unlike the heat and diffusion equations, the dispersion
of waves means that the solution dies out.
group velocity and wave velocity.
The KortewegdeVries Equation
The simplest wave equation that combines dispersion with nonlinearity is the celebrated KortewegdeVries equation
ut + uxxx + u ux = 0.

(21.27)

The equation was first derived by the French applied mathematician Boussinesq, [18;
eq. (30), p. 77], [19; eqs. (283, 291)], in 1872 as a model for surface water waves. It was
rediscovered by the Dutch mathematicians Korteweg and de Vries, [78], over two decades
later. More recently, in the early 1960s, Kruskal and Zabusky, [129], rederived it as a continuum limit of a model of nonlinear mass-spring chains studied by Fermi, Pasta and Ulam,
[44]. Their numerical experiments on the eqation opened the door to the undertanding
of its many remarkable properties. It has a critical balance between nonlinear effects and
dispersion, leading to integrability. There is a remarkable transformation, known as the
inverse scattering transform, which is a form of nonlinear Fourier transform, that can be
used to solve the KortewegdeVries equation. Its fascinating properties continue to be of
great current research interest to this day.
The most important special solutions to the KortewegdeVries equation are the traveling waves. We assume that
u = v() = v(x c t)
3/7/03

915

c 2003

Peter J. Olver

to be a wave of permanent form, translating to the right with speed c. We also assume
that the wave is localized, meaning that u and its derivatives tend to 0 as | x | . Note
that
u
u
3u
= c v 0 (),
= v 0 (),
= v 000 ().
t
x
x3
Therefore, v() satsifies the third order nonlinear ordinary differential equation
v 000 + v v 0 c v = 0.

(21.28)

Moreover, we impose boundary conditions


lim

v() =

v 0 () =

lim

lim

v 00 () = 0.

(21.29)

This equation can be integrated. First, note that it can be written as a derivative:

d 00 1 2
v + 2 v c v = 0,
d

and hence

v 00 + 12 v 2 c v = a,

where a is a constant of integration. However, the boundary conditions as imply that


a = 0. Multiplying the latter equation by v 0 allows us to integrate a second time

d 1 0 2 1 3 1 2
(v ) + 6 v 2 c v = v 0 v 00 + 12 v 2 c v = 0.
2
d
Integrating both sides of the equation,
1 0 2
2 (v )

+ 16 v 3 21 c v 2 = b,

where b is a second constant of integration, which, again by the boundary conditions


(21.29), is also b = 0. Therefore, v() satisfies the first order autonomous ordinary differential equation
q
dv
= v c 13 v .
d
We integrate by the usual method:
Z

dv
1
3

= + .

c v

The solution has the form


v() = 3 c sech2
where
sech y =

1
2

c+ ,

1
2
= y
,
cosh y
e + ey

is the hyperbolic secant function. Hence, the localized traveling wave solutions of the
KortewegdeVries equation equation are of the form

u(t, x) = 3 c sech2 12 c (x c t) + ,
(21.30)
3/7/03

916

c 2003

Peter J. Olver

where c > 0 and are arbitrary constants. The parameter c measures the velocity of the
wave. It also masures its amplitude, since the maximum value of u(t, x) is 3 c since sech y
has a maximum value of 1 at y = 0. Therefore, the taller the wave, the faster it moves.
See Figure soliton for a graph.
The solution (21.30) is known as a solitary wave solution since it represents a localized
wave that travels unchanged in shape. Such waves were first observed by the British
engineer J. Scott Russell, [109], who tells the remarkable incident of chasing such a wave
generated by the sudden motion of a barge along an Edinburgh canal on horseback for
several miles. The mathematician Airy claimed that such waves could not exist, but he
based his analysis upon a linearized theory. Boussinesqs establishment of the surface wve
model demonstrated that such localized disturbances can result from nonlinear effects in
the system.
Remark : In the KortewegdeVries equation model, one can find arbitrarily tall soliton
solutions. In physical water waves, if the wave is too tall it will break. Indeed, it can be
rigorously proved that the full water wave equations admit solitary wave solutions, but
there is a wave of greatest height, beyond which a wave will tend to break. The solitary
water waves are not geniune solitons, since there is a small, but measureable, effect when
two waves collide.
These nonlinear traveling wave solutions were discovered by Kruskal and Zabusky,
[129], to have remarkable properties. For this reason they have been given a special new
name soliton. Ordinarily, combining two solutions to a nonlinear equation can be quite
unpredictable, and one might expect any number of scenarios to occur. If you start with
initial conditions representing a taller wave to the left of a shorter wave, the solution
of the KortewegdeVries equation runs as follows. The taller wave moves faster, and so
catches up the shorter wave. They then have a very complicated nonlinear interaction, as
expected. But, remarkably, after a while they emerge from the interaction unscathed. the
smaller wave is now in back and the larger one in front. After this, they proceed along
their way, with the smaller one lagging behind the high speed tall wave. the only effect
of their encounter is a phase shift, meaning a change in the value of the phase parameter
in each wave. See Figure solitons . After the interaction, the position of the soliton if
it had traveled unhindered by the other is shown in a dotted line. Thus, they behave like
colliding paricles, which is the genesis of the word soliton.
A similar phenomenon holds for several such soliton solutions. After some time where
the various waves interact, they finally emerge with the largest soliton in front, and then
in order to the smallest one in back, all progressing at their own speed, and so gradually
drawing apart.
Moreover, starting with an arbitrary initial disturbance
u(0, x) = f (x)
it can be proved that after some time, the solution disintegrates into a finite number of
solitons of different heights, moving off to the right, plus a small dispersive tail moving
to the left that rapidly disappears. Propving this remarkable result is beyond the scope
3/7/03

917

c 2003

Peter J. Olver

of this book. It relies on the method of inverse scattering, that connects the Korteweg
deVries equation with a linear eigenvalue problem of fundamental importance in onedimensional quantum mechanics. The solitons correspond to the bound states of a quantum
potential. We refer the interested reader to the introductory text [41] and the more
advanced monograph [1] for details.

21.4. Conclusion and Bon Voyage.


These are your first wee steps in a vast new realm. We are unable to discuss nonlinear
partial differential equations arising in fluid mechanics, in elasticity, in relativity, in differential geometry, n computer vision, in mathematical biology. We bid the reader adieu and
farewell.

3/7/03

918

c 2003

Peter J. Olver

Appendix A
Vector Calculus in Two Dimensions
so far, we have concentrated on problems of one-dimensional media bars, beams
and strings. In order to study the partial differential equations describing the equilibria
and dynamics of planar media, we need to review the basics of vector calculus in the two
dimensions. We begin with a discussion of plane curves and domains. Many physical
quantities, including force and velocity, are determined by vector fields, and we review the
basic concepts. The key differential operators in planar vector calculus are the gradient and
divergence operations, along with the Jacobian matrix for maps from R 2 to itself. There
are three basic types of line integrals: integrals with respect to arc length, for computing
lengths of curves, masses of wires, center of mass, etc., ordinary line integrals of vector
fields for computing work and fluid circulation, and flux line integrals for computing flux
of fluids and forces. Nex, we review the basics of double integrals of scalar functions over
plane domains. Line and double integrals are connected by the justly famous Greens
theorem, which is the two-dimensional version of the fundamental theorm of calculus. The
integration by parts argument required to characterize the adjoint of a partial differential
operator rests on the closely allied Greens formula.
Space limitations require us to go through this material fairly rapidly, and we assume
that you already gained sufficient familiarity with most of these concepts in a sophomorelevel multi-variable calculus course. More details, and full justifications of these results
can be found in many of the standard vector calculus texts, including [9].

A.1. Plane Curves.


We begin our review by collecting together the basic facts concerning geometry of
plane curves. A curve C R 2 is parametrized by a pair of continuous functions

x(t)
x(t) =
R2,
(A.1)
y(t)

where the scalar parameter t varies over an (open or closed) interval I R. When it
exists, the tangent vector to the curve at the point x is described by the derivative,

dx

x
(A.2)
=x= .
y
dt
We shall often use Newton's dot notation to abbreviate derivatives with respect to the parameter t.
Physically, we can think of a curve as the trajectory described by a particle moving in
the plane. The parameter t is identified with the time, and so x(t) gives the position of the
Figure A.1. Planar Curves: Cusped Curve, Circle, Figure Eight.

particle at time t. The tangent vector ẋ(t) measures the velocity of the particle at time t; its magnitude ‖ẋ‖ = √(ẋ² + ẏ²) is the speed, while its orientation (assuming the velocity
is nonzero) indicates the instantaneous direction of motion of the particle as it moves
along the curve. Thus, by the orientation of a curve, we mean the direction of motion or
parametrization, as indicated by the tangent vector. Reversing the orientation amounts
to moving backwards along the curve, with the individual tangent vectors pointing in the
opposite direction.
The curve parametrized by x(t) is called smooth provided its tangent vector is continuous and everywhere nonzero: ẋ ≠ 0. This is because curves with vanishing derivative may have corners or cusps; a simple example is the first curve plotted in Figure A.1, which has parametrization
\[ \mathbf{x}(t) = \begin{pmatrix} t^2 \\ t^3 \end{pmatrix}, \qquad \dot{\mathbf{x}}(t) = \begin{pmatrix} 2\,t \\ 3\,t^2 \end{pmatrix}, \]
and has a cusp at the origin when t = 0 and ẋ(0) = 0. Physically, a particle trajectory
remains smooth as long as the speed of the particle is never zero, which effectively prevents
the particle from instantaneously changing its direction of motion. A closed curve is smooth

if, in addition to satisfying x(t) 6= 0 at all points a t b, the tangents at the enpoints

match up: x(a) = x(b). A curve is called piecewise smooth if its derivative is piecewise
continuous and nonzero everywhere. The corners in a piecewise smooth curve have welldefined right and left tangents. For example, polygons, such as triangles and rectangles,
are piecewise smooth curves. In this book, all curves are assumed to be at least piecewise
smooth.
A curve is simple if it has no self-intersections: x(t) ≠ x(s) whenever t ≠ s. Physically, this means that the particle is never in the same position twice. A curve is closed if x(t) is defined for a ≤ t ≤ b and its endpoints coincide: x(a) = x(b), so that the particle ends up where it began. For example, the unit circle
\[ \mathbf{x}(t) = (\cos t,\; \sin t)^T \qquad \text{for} \qquad 0 \leq t \leq 2\pi, \]
Throughout this chapter, we always use the standard Euclidean inner product and norm.
With some care, all of the concepts can be adapted to other choices of inner product. In differential
geometry and relativity, one even allows the inner product and norm to vary from point to point,
[ 40 ].

is closed and simple, while the curve
\[ \mathbf{x}(t) = (\cos t,\; \sin 2t)^T \qquad \text{for} \qquad 0 \leq t \leq 2\pi, \]
is not simple since it describes a figure eight that intersects itself at the origin. Both curves
are illustrated in Figure A.1.

Assuming the tangent vector x(t) 6= 0, then the normal vector to the curve at the
point x(t) is the orthogonal or perpendicular vector

(A.3)
x =
x

of the same length k x k = k x k. Actually, there are two such normal vectors, the other

being the negative x . We will always make the right-handed choice (A.3) of normal,
meaning that as we traverse the curve, the normal always points to our right. If a simple
closed curve C is oriented so that it is traversed in a counterclockwise direction (the standard mathematical orientation), then (A.3) describes the outwards-pointing normal.
If we reverse the orientation of the curve, then both the tangent vector and normal vector
change directions; thus (A.3) would give the inwards-pointing normal for a simple closed
curve traversed in the clockwise direction.
The same curve C can be parametrized in many different ways. In physical terms, a
particle can move along a prescribed trajectory at a variety of different speeds, and these
correspond to different ways of parametrizing the curve. Conversion from one parametrization x(t) to another x̃(τ) is effected by a change of parameter, which is a smooth, invertible function t = g(τ); the reparametrized curve is then x̃(τ) = x(g(τ)). We require that dt/dτ = g′(τ) > 0 everywhere. This ensures that each t corresponds to a unique value of τ, and, moreover, the curve remains smooth and is traversed in the same overall direction under the reparametrization. On the other hand, if g′(τ) < 0 everywhere, then the orientation of the curve is reversed under the reparametrization. We shall use the notation −C to indicate the curve having the same shape as C, but with the reversed orientation.
Example A.1. The function x(t) = (cos t, sin t)^T for 0 < t < π parametrizes a semi-circle of radius 1 centered at the origin. If we set τ = −cot t, then we obtain the less evident parametrization
\[ \widetilde{\mathbf{x}}(\tau) = \left( \frac{-\,\tau}{\sqrt{1+\tau^2}},\; \frac{1}{\sqrt{1+\tau^2}} \right)^T \qquad \text{for} \qquad -\infty < \tau < \infty \]
of the same semi-circle, traversed in the same direction. In the familiar parametrization, the velocity vector has unit length, ‖ẋ‖ ≡ 1, and so the particle moves around the semicircle in the counterclockwise direction with unit speed. In the second parametrization, the particle

For a closed curve to be simple, we require x(t) ≠ x(s) whenever t ≠ s, except at the ends, where x(a) = x(b) is required for the ends to close up.

The minus sign is to ensure that dτ/dt > 0.

Figure A.2. Topology of Planar Domains: Interior Point, Bounded Domain, A Simple Closed Curve.

slows down near the endpoints, and, in fact, takes an infinite amount of time to traverse
the semicircle from right to left.

A.2. Planar Domains.


A plate or other two-dimensional body occupies a region in the plane, known as a
domain. The simplest example is an open circular disk
\[ D_r(\mathbf{a}) = \{\, \mathbf{x} \in \mathbb{R}^2 \mid \| \mathbf{x} - \mathbf{a} \| < r \,\} \tag{A.4} \]
of radius r centered at a point a ∈ R². In order to properly formulate the mathematical
tools needed to understand boundary value problems and dynamical equations for such
bodies, we first need to review basic terminology from point set topology of planar sets.
Many of the concepts carry over as stated to subsets of any higher dimensional Euclidean
space R n .
Let Ω ⊂ R² be any subset. A point a ∈ Ω is called an interior point if some small disk centered at a is entirely contained within the set: D_ε(a) ⊂ Ω for some ε > 0; see Figure A.2. The set Ω is open if every point is an interior point. A set K is closed if and only if its complement R² \ K = { x ∉ K } is open.

Example A.2. If f (x, y) is any continuous real-valued function, then the subset
{ f(x, y) > 0 } where f is strictly positive is open, while the subset { f(x, y) ≥ 0 } where f
is non-negative is closed. One can, of course, replace 0 by any other constant, and also
reverse the direction of the inequalities, without affecting the conclusions.
In particular, the set
Dr = { x 2 + y 2 < r 2 }
(A.5)
consisting of all points of (Euclidean) norm strictly less than r, defines an open disk of
radius r centered at the origin. On the other hand,
\[ K_r = \{\, x^2 + y^2 \leq r^2 \,\} \tag{A.6} \]
is the closed disk of radius r, which includes the bounding circle
\[ C_r = \{\, x^2 + y^2 = r^2 \,\}. \tag{A.7} \]
Figure A.3. Open Sets Defined by a Hyperbola.

A point x⋆ is a limit point of a set Ω if there exists a sequence of points x⁽ⁿ⁾ ∈ Ω converging to it, so that x⁽ⁿ⁾ → x⋆ as n → ∞. Every point x ∈ Ω is a limit point (just take all x⁽ⁿ⁾ = x), but the converse is not necessarily valid. For example, the points on the circle (A.7) are all limit points for the open disk (A.5). The closure of a set Ω, written Ω̄, is defined as the set of all limit points of Ω. In particular, a set K is closed if and only if it contains all its limit points, and so K̄ = K. The boundary ∂Ω of a subset Ω consists of all limit points which are not interior points. If Ω is open, then its closure is the disjoint union of the set and its boundary: Ω̄ = Ω ∪ ∂Ω. Thus, the closure of the open disk D_r is the closed disk D̄_r = D_r ∪ C_r; the circle C_r = ∂D_r = ∂D̄_r forms their common boundary.
An open subset that can be written as the union, Ω = Ω₁ ∪ Ω₂, of two disjoint, nonempty, open subsets, so Ω₁ ∩ Ω₂ = ∅, is called disconnected. For example, the open set
\[ \Omega = \{\, x^2 - y^2 > 1 \,\} \tag{A.8} \]
is disconnected, consisting of two disjoint sectors bounded by the two branches of the hyperbola x² − y² = 1; see Figure A.3. On the other hand, the complementary open set
\[ \widehat{\Omega} = \{\, x^2 - y^2 < 1 \,\} \tag{A.9} \]
is connected, and consists of all points between the two hyperbolas.


A subset Ω ⊂ R² is called bounded if it is contained inside a (possibly large) disk, i.e., Ω ⊂ D_r for some r > 0, as in the second picture in Figure A.2. Thus, both the open and the closed disks (A.5), (A.6) are bounded, whereas the two hyperbolic sectors (A.8), (A.9) are both unbounded.
The class of subsets for which the boundary value problems for the partial differential
equations of equilibrium mechanics are properly prescribed can now be defined.
Definition A.3. A planar domain is a connected, open subset Ω ⊂ R² whose boundary ∂Ω consists of one or more piecewise smooth, simple curves, such that Ω lies entirely on one side of each of its boundary curve(s).

See Section 11.5 for more details on convergence.


Figure A.4. Planar Domains: Rectangle, Annulus, Wedge.

The last condition is to avoid dealing with pathologies. For example, the subset Ω \ C obtained by cutting out a curve C from the interior of an open set Ω would not be an allowable domain.
Example A.4. The open rectangle R = { a < x < b, c < y < d } is an open, connected and bounded domain. Its boundary is a piecewise smooth curve, since there are
corners where the tangent does not change continuously.
The annulus
\[ \{\, r^2 < x^2 + y^2 < R^2 \,\}, \qquad \text{for fixed} \qquad 0 < r < R, \tag{A.10} \]
is an open, connected, bounded domain whose boundary consists of two disjoint concentric circles. The degenerate case of a punctured disk, when r = 0, is not a domain since its boundary consists of a circle and a single point, the origin.
Another well-studied example is the wedge-shaped domain W = { α < θ < β }, consisting of all points whose angular coordinate θ = tan⁻¹(y/x) lies between two prescribed values. If 0 < β − α < 2π, then the wedge is a domain whose boundary consists of two connected rays. However, if β = α + 2π, then the wedge is obtained by cutting the plane along a single ray at angle α. The latter case does not comply with our definition of a domain since the wedge now lies on both sides of its boundary ray.
Any connected domain is automatically pathwise connected, meaning that any two points can be connected by (i.e., are the endpoints of) a curve lying entirely within the domain. If the domain is bounded, which is the most important case for boundary value problems, then its boundary consists of one or more piecewise smooth, simple, closed curves. A bounded domain Ω is called simply connected if it has just one such boundary curve; this means that Ω is connected and has no holes, and so its boundary ∂Ω = C is a simple closed curve that contains Ω in its interior. For instance, an open disk and a
rectangle are both simply connected, whereas an annulus is not.
The Jordan Curve Theorem states the intuitively obvious, but actually quite deep,
result that any simple closed curve divides the plane R² into two disjoint, connected, open domains: its interior, which is bounded and simply connected, and its exterior, which is unbounded and not simply connected. This result is illustrated in the final figure in
Figure A.2; the interior of the indicated simple closed curve is shaded in gray while the
Figure A.5. Vector Fields.

exterior is in white. Note that each subdomain lies entirely on one side of the curve,
which forms their common boundary.
The following result is often used to characterize the simple connectivity of more
general planar subsets, including unbounded domains.
Lemma A.5. A planar domain Ω ⊂ R² is simply connected if it is connected and, moreover, the interior of any simple closed curve C ⊂ Ω is also contained in Ω.
For example, an annulus (A.10) is not simply connected because the interior of a circle
going around the hole is not entirely contained within the annulus. On the other hand, the
unbounded domain (A.9) lying between two branches of a hyperbola is simply connected,
even though its boundary consists of two disjoint, unbounded curves.

A.3. Vector Fields.

A vector-valued function v(x, y) = (v₁(x, y), v₂(x, y))^T is known as a (planar) vector field. A vector field assigns a vector v(x, y) to each point (x, y)^T in its domain of definition Ω, and hence defines a (in general nonlinear) function v: Ω → R². The vector field can be conveniently depicted by drawing an arrow representing the vector v = v(x, y) starting at its point of definition (x, y)^T. See Figure A.5 for some representative sketches.
Example A.6. Vector fields arise very naturally in physics and engineering applications from physical forces: gravitational, electrostatic, centrifugal, etc. A force field
f(x, y) = (f₁(x, y), f₂(x, y))^T describes the direction and magnitude of the force experienced by a particle at position (x, y). In a planar universe, the gravitational force field exerted by a point mass concentrated at the origin has, according to Newtonian gravitational theory, magnitude proportional to 1/r, where r = ‖x‖ is the distance to the origin,
and is directed towards the origin. Thus, the vector field describing gravitational force has

In three-dimensional Newtonian gravity, 1/r is replaced by 1/r 2 .

the form
\[ \mathbf{f} = -\,\frac{\gamma\,\mathbf{x}}{\|\mathbf{x}\|^2} = \left( \frac{-\,\gamma\,x}{x^2+y^2},\; \frac{-\,\gamma\,y}{x^2+y^2} \right)^T, \tag{A.11} \]
where γ > 0 denotes the constant of proportionality, namely the product of the two masses times the universal gravitational constant. The same force law applies to the attraction, γ > 0, and repulsion, γ < 0, of electrically charged particles.
Newton's Laws of planetary motion produce the second order system of differential equations
\[ m\,\frac{d^2\mathbf{x}}{dt^2} = \mathbf{f}. \]
The solutions x(t) describe the trajectories of planets subject to a central gravitational
force, e.g., the sun. They also govern the motion of electrically charged particles under a
central electric charge, e.g., classical (i.e., not quantum) electrons revolving around a central nucleus. In three-dimensional Newtonian mechanics, planets move along conic sections
ellipses in the case of planets, and parabolas and hyperbolas in the case of non-recurrent
objects like some comets. Interestingly (and not as well-known), the corresponding twodimensional theory is not as neatly described the typical orbit of a planet around a
planar sun does not form a simple closed curve, [37]!
Example A.7. Another important example is the velocity vector field v of a steady-state fluid flow. The vector v(x, y) measures the instantaneous velocity of the fluid particles (molecules or atoms) as they pass through the point (x, y). Steady-state means that the velocity at a point (x, y) does not vary in time, even though the individual fluid particles are in motion. If a fluid particle moves along the curve x(t) = (x(t), y(t))^T, then its velocity at time t is the derivative v = ẋ of its position with respect to t. Thus, for a time-independent velocity vector field v(x, y) = (v₁(x, y), v₂(x, y))^T, the fluid particles will move in accordance with an autonomous, first order system of ordinary differential equations
\[ \frac{dx}{dt} = v_1(x, y), \qquad \frac{dy}{dt} = v_2(x, y). \tag{A.12} \]
According to the basic theory of systems of ordinary differential equations, an individual particle's motion x(t) will be uniquely determined solely by its initial position x(0) = x₀.
In fluid mechanics, the trajectories of particles are known as the streamlines of the flow.
The velocity vector v is everywhere tangent to the streamlines. When the flow is steady,
the streamlines do not change in time. Individual fluid particles experience the same
motion as they successively pass through a given point in the domain occupied by the
fluid.
As a specific example, consider the vector field
\[ \mathbf{v}(x, y) = \begin{pmatrix} -\,\omega\,y \\ \omega\,x \end{pmatrix}, \tag{A.13} \]

See Section 19.2 for details.

Figure A.6. Steady State Fluid Flows: Rotational Flow, Source, Spiral Sink.

for fixed ω > 0, which is plotted in the first figure in Figure A.5. The corresponding fluid trajectories are found by solving the associated first order system of ordinary differential equations
\[ \dot{x} = -\,\omega\,y, \qquad \dot{y} = \omega\,x, \]
with initial conditions x(0) = x₀, y(0) = y₀. This is a linear system, and can be solved by the eigenvalue and eigenvector techniques presented in Chapter 8. The resulting flow
\[ x(t) = x_0 \cos \omega t - y_0 \sin \omega t, \qquad y(t) = x_0 \sin \omega t + y_0 \cos \omega t, \]
corresponds to a fluid that is uniformly rotating around the origin. The streamlines are concentric circles, and the fluid particles rotate around the circles in a counterclockwise direction with angular velocity ω, as illustrated in Figure A.6. Note that the fluid velocity
v is everywhere tangent to the circles. The origin is a stagnation point, since the velocity
field v = 0 vanishes there, and the particle at the origin does not move.
As another example, the radial vector field
\[ \mathbf{v}(x, y) = \lambda\,\mathbf{x} = \begin{pmatrix} \lambda\,x \\ \lambda\,y \end{pmatrix} \tag{A.14} \]
corresponds to a fluid source, λ > 0, or sink, λ < 0, at the origin, and is plotted in the second figure in Figure A.5. The solution to the first order system of ordinary differential equations ẋ = λx with initial conditions x(0) = x₀ gives the radial flow x(t) = e^{λt} x₀. The streamlines are the rays emanating from the origin, and the motion is outwards (source) or inwards (sink) depending on the sign of λ. As in the rotational flow, the origin is a stagnation point.
Combining the radial and circular flow vector fields,
\[ \mathbf{v}(x, y) = \begin{pmatrix} \lambda\,x - \omega\,y \\ \omega\,x + \lambda\,y \end{pmatrix}, \tag{A.15} \]
leads to a swirling source or sink; think of the water draining out of your bathtub. Again, the flow is found by integrating a linear system of ordinary differential equations,
\[ \dot{x} = \lambda\,x - \omega\,y, \qquad \dot{y} = \omega\,x + \lambda\,y. \]
Solving as in Chapter 8, we find that the fluid particles follow the spiral streamlines
\[ x(t) = e^{\lambda t} \left( x_0 \cos \omega t - y_0 \sin \omega t \right), \qquad y(t) = e^{\lambda t} \left( x_0 \sin \omega t + y_0 \cos \omega t \right), \]
again illustrated in Figure A.6.
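As a small numerical aside (not part of the original text), the following Python sketch integrates the swirling-flow system with scipy and compares the result against the closed-form spiral solution above. The symbols λ and ω are the ones reconstructed here, and the parameter values and initial point are arbitrary choices for the test.

    # A minimal numerical check of the swirling source/sink flow; lambda and omega
    # are the reconstructed symbols, and the values below are arbitrary test choices.
    import numpy as np
    from scipy.integrate import solve_ivp

    lam, omega = -0.5, 2.0          # sink (lam < 0) with counterclockwise swirl
    x0, y0 = 1.0, 0.0               # arbitrary initial position

    def velocity(t, xy):
        x, y = xy
        return [lam * x - omega * y, omega * x + lam * y]

    sol = solve_ivp(velocity, (0.0, 3.0), [x0, y0],
                    dense_output=True, rtol=1e-9, atol=1e-12)

    t = np.linspace(0.0, 3.0, 7)
    x_exact = np.exp(lam * t) * (x0 * np.cos(omega * t) - y0 * np.sin(omega * t))
    y_exact = np.exp(lam * t) * (x0 * np.sin(omega * t) + y0 * np.cos(omega * t))
    x_num, y_num = sol.sol(t)

    # Both errors should be tiny, confirming the closed-form spiral streamlines.
    print(np.max(np.abs(x_num - x_exact)), np.max(np.abs(y_num - y_exact)))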

Remark : All of the phase portraits for linear systems of first order ordinary differential
equations in two variables presented in Section 8.7 can be reinterpreted as streamline plots
for steady state fluid flows. Additional, nonlinear examples, along with numerical solution
techniques, can be found in Chapter 19.
Remark : Of course, physical fluid motion occurs in three-dimensional space. However,
any planar flow can also be viewed as a particular type of three-dimensional fluid motion
that does not depend upon the vertical coordinate. The motion on every horizontal plane
is the same, and so the planar flow represents a cross-section of the full three-dimensional
motion. For example, slicing a steady flow past a vertical cylinder by a transverse horizontal
plane results in a planar flow around a circle; see Figure fcyl .

A.4. Gradient and Curl.


In the same vein, a scalar-valued function u(x, y) is often referred to as a scalar field, since it assigns a scalar to each point (x, y)^T in its domain of definition. Typical
physical examples of scalar fields include temperature, deflection of a membrane, height of
a topographic map, density of a plate, and so on.
The gradient operator ∇ maps a scalar field u(x, y) to the vector field
\[ \nabla u = \operatorname{grad} u = \begin{pmatrix} \partial u / \partial x \\ \partial u / \partial y \end{pmatrix} \tag{A.16} \]
consisting of its two first order partial derivatives. The scalar field u is often referred to as a potential function for its gradient vector field ∇u. For example, the gradient of the potential function u(x, y) = x² + y² is the radial vector field ∇u = (2x, 2y)^T. Similarly,
the gradient of the logarithmic potential function
\[ u(x, y) = -\log r = -\tfrac{1}{2} \log (x^2 + y^2) \]
is the gravitational force (A.11) exerted by a point mass concentrated at the origin. Additional physical examples include the velocity potential of certain fluid velocity vector fields
and the electromagnetic potential whose gradient describes the electromagnetic force field.
Not every vector field admits a potential, because not every vector field lies in the range of the gradient operator ∇. Indeed, if u(x, y) has continuous second order partial derivatives, and
\[ \mathbf{v} = \begin{pmatrix} v_1 \\ v_2 \end{pmatrix} = \nabla u = \begin{pmatrix} u_x \\ u_y \end{pmatrix}, \]
then, by the equality of mixed partials,
\[ \frac{\partial v_1}{\partial y} = \frac{\partial^2 u}{\partial y\,\partial x} = \frac{\partial^2 u}{\partial x\,\partial y} = \frac{\partial v_2}{\partial x}. \]
The resulting equation
\[ \frac{\partial v_1}{\partial y} = \frac{\partial v_2}{\partial x} \tag{A.17} \]

constitutes one of the necessary conditions that a vector field must satisfy in order to be
a gradient. Thus, for example, the rotational vector field (A.13) does not satisfy (A.17),
and hence is not a gradient. There is no potential function for such circulating flows.
The difference between the two terms in (A.17) is known as the curl of the planar vector field v = (v₁, v₂)^T, and denoted by
\[ \nabla \wedge \mathbf{v} = \operatorname{curl} \mathbf{v} = \frac{\partial v_2}{\partial x} - \frac{\partial v_1}{\partial y}. \tag{A.18} \]
Notice that the curl of a planar vector field is a scalar field. (In contrast, in three dimensions, the curl of a vector field is a vector field; see (B.76).) Thus, a necessary condition for a vector field to be a gradient is that its curl vanish identically: ∇ ∧ v ≡ 0.
Even if the vector field has zero curl, it still may not be a gradient. Interestingly, the
general criterion depends only upon the topology of the domain of definition, as clarified
in the following theorem.
Theorem A.8. Let v be a smooth vector field defined on a domain Ω ⊂ R². If v = ∇u for some scalar function u, then ∇ ∧ v ≡ 0. If Ω is simply connected, then the converse holds: if ∇ ∧ v ≡ 0, then v = ∇u for some potential function u defined on Ω.
As we shall see, this result is a direct consequence of Green's Theorem A.25.
Example A.9. The vector field
\[ \mathbf{v} = \left( \frac{y}{x^2 + y^2},\; \frac{-\,x}{x^2 + y^2} \right)^T \tag{A.19} \]
satisfies ∇ ∧ v ≡ 0. However, there is no potential function defined for all (x, y) ≠ (0, 0) such that ∇u = v. As the reader can check, the negative of the angular coordinate,
\[ u = -\,\theta = -\tan^{-1} \frac{y}{x}, \tag{A.20} \]
satisfies ∇u = v, but is not well-defined on the entire domain since it experiences a jump discontinuity of magnitude 2π as we go around the origin. Indeed, Ω = { x ≠ 0 } is not simply connected, and so Theorem A.8 does not apply. On the other hand, if we restrict v to any simply connected subdomain Ω̂ that does not encircle the origin, then the angular coordinate (A.20) can be unambiguously and smoothly defined on Ω̂, and does serve as a single-valued potential function for v; see Exercise .

In this text, we adopt the more modern wedge notation ∇ ∧ v for what is often denoted by a cross, ∇ × v.
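The two computations underlying Example A.9 are easy to confirm symbolically. The sketch below is an added illustration (not from the original text); it uses the components of (A.19) as reconstructed above, checks that the planar curl (A.18) vanishes away from the origin, and then evaluates the circulation of v around the unit circle, which turns out to be −2π, so no single-valued potential can exist on the punctured plane.

    # Symbolic check of Example A.9 (vector field components as reconstructed above).
    import sympy as sp

    x, y, t = sp.symbols('x y t', real=True)
    v1 = y / (x**2 + y**2)           # first component of (A.19)
    v2 = -x / (x**2 + y**2)          # second component of (A.19)

    curl = sp.simplify(sp.diff(v2, x) - sp.diff(v1, y))
    print(curl)                      # 0  -> the curl vanishes for (x, y) != (0, 0)

    # Circulation around the unit circle, traversed counterclockwise
    xc, yc = sp.cos(t), sp.sin(t)
    integrand = (v1.subs({x: xc, y: yc}) * sp.diff(xc, t)
                 + v2.subs({x: xc, y: yc}) * sp.diff(yc, t))
    print(sp.integrate(sp.simplify(integrand), (t, 0, 2*sp.pi)))   # -2*pi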

In fluid mechanics, the curl of a vector field measures the local circulation in the associated steady state fluid flow. If we place a small paddle wheel in the fluid, then its rate of spinning will be in proportion to ∇ ∧ v. (An explanation of this fact will appear below.) The fluid flow is called irrotational if its velocity vector field has zero curl, and hence, assuming Ω is simply connected, is a gradient: v = ∇u. In this case, the paddle wheel will not spin. The scalar function u(x, y) is known as the velocity potential for the fluid motion. Similarly, a force field that is given by a gradient, f = ∇φ, is called a conservative force field, and the function φ defines the force potential.
Suppose u(x) = u(x, y) is a scalar field. Given a parametrized curve x(t) = (x(t), y(t))^T,
the composition f (t) = u(x(t)) = u(x(t), y(t)) indicates the behavior as we move along
the curve. For example, if u(x, y) represents the elevation of a mountain range at position
(x, y), and x(t) represents our position at time t, then f (t) = u(x(t)) is our altitude at time
t. Similarly, if u(x, y) represents the temperature at (x, y), then f (t) = u(x(t)) measures
our temperature at time t.
The rate of change of the composite function is found through the chain rule:
\[ \frac{df}{dt} = \frac{d}{dt}\,u(x(t), y(t)) = \frac{\partial u}{\partial x}\frac{dx}{dt} + \frac{\partial u}{\partial y}\frac{dy}{dt} = \nabla u \cdot \dot{\mathbf{x}}, \tag{A.21} \]
and hence equals the dot product between the gradient ∇u(x(t)) and the tangent vector ẋ(t) to the curve at the point x(t). For instance, our rate of ascent or descent as we travel through the mountains is given by the dot product of our velocity vector with the gradient of the elevation function. The dot product between the gradient and a fixed vector a = (a, b)^T is known as the directional derivative of the scalar field u(x, y) in the direction a, and denoted by
\[ \frac{\partial u}{\partial \mathbf{a}} = \mathbf{a} \cdot \nabla u = a\,u_x + b\,u_y. \tag{A.22} \]
Thus, the rate of change of u along a curve x(t) is given by its directional derivative ∂u/∂ẋ = ∇u · ẋ, as in (A.21), in the tangent direction. This leads us to one important interpretation of the gradient vector.
Proposition A.10. The gradient ∇u of a scalar field points in the direction of steepest increase of u. The negative gradient, −∇u, which points in the opposite direction, indicates the direction of steepest decrease of u.
For example, if u(x, y) represents the elevation of a mountain range at position (x, y) on a map, then ∇u tells us the direction that is steepest uphill, while −∇u points directly downhill, the direction water will flow. Similarly, if u(x, y) represents the temperature of a two-dimensional body, then ∇u tells us the direction in which it gets hottest the fastest. Heat energy (like water) will flow in the opposite direction, namely in the direction of the vector −∇u. This basic fact underlies the derivation of the multi-dimensional heat and
diffusion equations.
You need to be careful in how you interpret Theorem 18.39. Clearly, the faster you
move along a curve, the faster the function u(x, y) will vary, and one needs to take this
into account when comparing the rates of change along different curves. The easiest way

to normalize is to assume that the tangent vector a = ẋ has norm 1, so ‖a‖ = 1 and we


are going through x with unit speed. Once this is done, Theorem 18.39 is an immediate consequence of the Cauchy-Schwarz inequality (3.13). Indeed,
\[ \left| \frac{\partial u}{\partial \mathbf{a}} \right| = | \mathbf{a} \cdot \nabla u | \leq \| \mathbf{a} \|\,\| \nabla u \| = \| \nabla u \| \qquad \text{when} \qquad \| \mathbf{a} \| = 1, \]
with equality if and only if a = c ∇u points in the same direction as the gradient. Therefore, the maximum rate of change is when a = ∇u/‖∇u‖ is the unit vector in the direction of the gradient, while the minimum is achieved when a = −∇u/‖∇u‖ points in the opposite direction. As a result, Theorem 18.39 tells us how to move if we wish to minimize a scalar function as rapidly as possible.
Theorem A.11. A curve x(t) will realize the steepest decrease in the scalar field u(x) if and only if it satisfies the gradient flow equation
\[ \dot{\mathbf{x}} = -\,\nabla u, \qquad \text{or} \qquad \frac{dx}{dt} = -\,\frac{\partial u}{\partial x}(x, y), \quad \frac{dy}{dt} = -\,\frac{\partial u}{\partial y}(x, y). \tag{A.23} \]

The only points at which the gradient does not tell us about the directions of increase/decrease are the critical points, which are, by definition, points where the gradient vanishes: ∇u = 0. These include local maxima or minima of the function, i.e., mountain
peaks or bottoms of valleys, as well as other types of critical points like saddle points that
represent mountain passes. In such cases, we must look at the second or higher order
derivatives to tell the directions of increase/decrease; see Section 18.3 for details.
Remark : Theorem A.11 forms the basis of gradient descent methods for numerically
approximating the maxima and minima of functions. One begins with a guess (x 0 , y0 )
for the minimum and then follows the gradient flow to the minimum by numerically
integrating the system of ordinary differential equations (A.23). This idea will be developed
in detail in Chapter 18.
Example A.12. Consider the function u(x, y) = x² + 2y². Its gradient vector field is ∇u = (2x, 4y)^T, and hence the gradient flow equations (A.23) take the form
\[ \dot{x} = -\,2\,x, \qquad \dot{y} = -\,4\,y. \]
The solution that starts out at initial position (x₀, y₀)^T is
\[ x(t) = x_0\,e^{-2t}, \qquad y(t) = y_0\,e^{-4t}. \tag{A.24} \]
Note that the origin is a stable fixed point for this linear dynamical system, and the solutions x(t) → 0 converge exponentially fast to the minimum of the function u(x, y). If we start out not on either of the coordinate axes, so x₀ ≠ 0 and y₀ ≠ 0, then the trajectory (A.24) is a semi-parabola of the form y = c x², where c = y₀/x₀²; see the first picture in Figure A.7. These curves, along with the four coordinate semi-axes, are the paths to follow to get to the minimum 0 the fastest.
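As a brief computational illustration (added here, not part of the original text), one can follow the gradient flow (A.23) for this same function with a simple explicit Euler iteration and compare against the exact solution (A.24); the step size, number of steps, and starting point below are arbitrary choices for the demonstration.

    # Explicit Euler integration of the gradient flow (A.23) for u(x, y) = x**2 + 2*y**2,
    # compared against the exact solution (A.24).  Parameters are arbitrary test choices.
    import numpy as np

    x0, y0 = 1.0, 0.5
    dt, n_steps = 1e-3, 2000

    p = np.array([x0, y0])
    for _ in range(n_steps):
        grad = np.array([2.0 * p[0], 4.0 * p[1]])   # grad u = (2x, 4y)
        p = p - dt * grad                            # one Euler step of dx/dt = -grad u

    T = dt * n_steps
    exact = np.array([x0 * np.exp(-2 * T), y0 * np.exp(-4 * T)])
    print(p, exact)    # both approach the minimizer (0, 0); the Euler error is O(dt)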
Figure A.7. Orthogonal System of Ellipses and Parabolas.

Level Sets
Let u(x, y) be a scalar field. The curves defined by the implicit equation
\[ u(x, y) = c \tag{A.25} \]

holding the function u(x, y) constant are known as its level sets. For instance, if u(x, y)
represents the elevation of a mountain range, then its level sets are the usual contour lines
on a topographic map. The Implicit Function Theorem tells us that, away from critical
points, the level sets of a planar function are simple, though not necessarily closed, curves.
Theorem A.13. If u(x, y) has continuous partial derivatives, and, at a point, ∇u(x₀, y₀) ≠ 0, then the level set passing through the point (x₀, y₀)^T is a smooth curve near the point in question.
Critical points, where ∇u = 0, are either isolated points, or points of intersection of level sets. For example, the level sets of the function u(x, y) = 3x² − 2x³ + y² are plotted in Figure ls . The function has critical points at (0, 0)^T and (1, 0)^T. The former is a local minimum, and forms an isolated point of its level set, while the latter is a saddle point, and is the point of intersection of the level curves { u = 1 }.
If we parametrize an individual level set by x(t) = (x(t), y(t))^T, then (A.25) tells us that the composite function u(x(t), y(t)) = c is constant along the curve, and hence its derivative
\[ \frac{d}{dt}\,u(x(t), y(t)) = \nabla u \cdot \dot{\mathbf{x}} = 0 \]
vanishes. We conclude that the tangent vector ẋ to the level set is orthogonal to the gradient direction ∇u at each point. In this manner, we have established the following additional important interpretation of the gradient, which is illustrated in Figure A.8.
Theorem A.14. The gradient ∇u of a scalar field u is everywhere orthogonal to its level sets { u = c }.
Figure A.8. Level Sets and Gradient.

Comparing Theorems A.11 and A.14, we conclude that the curves of steepest descent
are always orthogonal (perpendicular) to the level sets of the function. Thus, if we want
to hike uphill the fastest, we should keep our direction of travel always perpendicular to
the contour lines. Similarly, if u(x, y) represents temperature in a planar body at position
(x, y) then the level sets are the curves of constant temperature, known as the isotherms.
Heat energy will flow in the negative gradient direction, and hence orthogonally to the
isotherms.
Example A.15. Consider again the function u(x, y) = x2 +2 y 2 from Example A.12.
Its level sets u(x, y) = x2 + 2 y 2 = c form a system of concentric ellipses centered at the
origin, illustrated in the second picture in Figure A.7. Theorem A.14 implies that the
parabolic trajectories (A.24) followed by the solutions to the gradient flow equations form
an orthogonal system of curves to the ellipses. This is evident in the third picture in
Figure A.7, showing that the ellipses and parabolas intersect everywhere at right angles.

A.5. Integrals on Curves.


As you know, integrals of scalar functions, ∫ₐᵇ f(t) dt, are taken along real intervals [a, b] ⊂ R. In higher dimensional calculus, there are a variety of possible types of integrals.


The closest in spirit to one-dimensional integration are line integrals, in which one
integrates along a curve. In planar calculus, line integrals come in three flavors. The most
basic are the integrals of scalar functions with respect to arc length. Such integrals are used
to compute lengths of curves, and masses of one-dimensional objects like strings and wires.
The second and third varieties are used to integrate a vector field along a curve. Integrating
the tangential component of the vector field is used, for instance, to compute work and
measure circulation along the curve. The last type integrates the normal component of the

A more accurate term would be curve integral, but the terminology is standard and will
not be altered in this text.

vector field along the curve, and represents flux (fluid, heat, electromagnetic, etc.) along
the curve.
Arc Length
The length of the plane curve x(t) over the parameter range a ≤ t ≤ b is computed by integrating the (Euclidean) norm of its tangent vector:
\[ L(C) = \int_a^b \left\| \frac{d\mathbf{x}}{dt} \right\| dt = \int_a^b \sqrt{\dot{x}^2 + \dot{y}^2}\; dt. \tag{A.26} \]
The formula is justified by taking the limit of sums of lengths of small approximating line segments, [9]. For example, if the curve is given as the graph of a function y = f(x) for a ≤ x ≤ b, then its length is computed by the familiar calculus formula
\[ L(C) = \int_a^b \sqrt{1 + \left( \frac{dy}{dx} \right)^2}\; dx. \tag{A.27} \]
It is important to verify that the length of a curve does not depend upon any particular parametrization (or even direction of traversal) of the curve.

Example A.16. The length of a circle x(t) = (r cos t, r sin t)^T, 0 ≤ t ≤ 2π, of radius r is given by
\[ L(C) = \int_0^{2\pi} \left\| \frac{d\mathbf{x}}{dt} \right\| dt = \int_0^{2\pi} r\; dt = 2\pi r, \]
verifying the well-known formula for its circumference. On the other hand, the curve
\[ \mathbf{x}(t) = \begin{pmatrix} a \cos t \\ b \sin t \end{pmatrix}, \qquad 0 \leq t \leq 2\pi, \tag{A.28} \]
parametrizes an ellipse with semi-axes a, b. Its arc length is given by the integral
\[ s = \int_0^{2\pi} \sqrt{a^2 \sin^2 t + b^2 \cos^2 t}\; dt. \]
Unfortunately, this integral cannot be expressed in terms of elementary functions. It is, in fact, an elliptic integral, [39], so named for this very reason!
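Although the ellipse's arc length has no elementary closed form, it is easy to evaluate numerically. The short script below is an added illustration (the semi-axes a = 2, b = 1 are arbitrary); it applies adaptive quadrature to the integral above, and reproduces the circumference 2πr in the circular case a = b = r.

    # Numerical evaluation of the elliptic arc length integral from Example A.16.
    import numpy as np
    from scipy.integrate import quad

    def arc_length(a, b):
        integrand = lambda t: np.sqrt(a**2 * np.sin(t)**2 + b**2 * np.cos(t)**2)
        value, _ = quad(integrand, 0.0, 2.0 * np.pi)
        return value

    print(arc_length(1.0, 1.0), 2.0 * np.pi)   # circle of radius 1: both ~6.2832
    print(arc_length(2.0, 1.0))                # ellipse with semi-axes 2, 1: ~9.688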
A curve is said to be parametrized by arc length, written x(s) = (x(s), y(s))^T, if one traverses it with constant, unit speed, which means that
\[ \left\| \frac{d\mathbf{x}}{ds} \right\| = 1 \tag{A.29} \]
ds = 1

Alternative norms lead to alternative notions of curve length, of importance in the study of
curved spaces in differential geometry. In Einstein's theory of relativity, one allows the norm to
vary from point to point, and hence length will vary over space.

Figure A.9. The Moving Frame for an Ellipse.

at all points. In other words, the length of that part of the curve between arc length parameter values s = s₀ and s = s₁ is exactly equal to s₁ − s₀. To convert from a more general parameter t to arc length s = σ(t), we must compute
\[ s = \sigma(t) = \int_a^t \left\| \frac{d\mathbf{x}}{dt} \right\| dt, \qquad \text{and so} \qquad ds = \| \dot{\mathbf{x}} \|\, dt = \sqrt{\dot{x}^2 + \dot{y}^2}\; dt. \tag{A.30} \]
The unit tangent to the curve at each point is obtained by differentiating with respect to the arc length parameter:
\[ \mathbf{t} = \frac{d\mathbf{x}}{ds} = \frac{\dot{\mathbf{x}}}{\| \dot{\mathbf{x}} \|} = \left( \frac{\dot{x}}{\sqrt{\dot{x}^2 + \dot{y}^2}},\; \frac{\dot{y}}{\sqrt{\dot{x}^2 + \dot{y}^2}} \right)^T, \qquad \text{so that} \qquad \| \mathbf{t} \| = 1. \tag{A.31} \]
(As always, we require ẋ ≠ 0.) The unit normal to the curve is orthogonal to the unit tangent,
\[ \mathbf{n} = \mathbf{t}^{\perp} = \left( \frac{dy}{ds},\; -\,\frac{dx}{ds} \right)^T = \left( \frac{\dot{y}}{\sqrt{\dot{x}^2 + \dot{y}^2}},\; \frac{-\,\dot{x}}{\sqrt{\dot{x}^2 + \dot{y}^2}} \right)^T, \qquad \text{so that} \qquad \| \mathbf{n} \| = 1, \quad \mathbf{n} \cdot \mathbf{t} = 0. \tag{A.32} \]
At each point on the curve, the vectors t, n form an orthonormal basis of R² known as the moving frame along the curve. For example, for the ellipse (A.28) with semi-axes a, b, the unit tangent and normal are given by
\[ \mathbf{t} = \frac{1}{\sqrt{a^2 \sin^2 t + b^2 \cos^2 t}} \begin{pmatrix} -\,a \sin t \\ b \cos t \end{pmatrix}, \qquad \mathbf{n} = \frac{1}{\sqrt{a^2 \sin^2 t + b^2 \cos^2 t}} \begin{pmatrix} b \cos t \\ a \sin t \end{pmatrix}, \]
and graphed in Figure A.9. Actually, a curve has two unit normals at each point: one points to our right side and the other to our left side as we move along the curve. The normal n in (A.32) is the right-handed normal, and is the traditional one to choose; the opposite, left-handed normal is its negative −n. If we traverse a simple closed curve in a counterclockwise direction, then the right-handed normal n is the unit outward normal, pointing to the curve's exterior.
Arc Length Integrals


We now explain how to integrate scalar functions along curves. Suppose first that C is a (piecewise) smooth curve that is parametrized by arc length, x(s) = (x(s), y(s)), for 0 ≤ s ≤ ℓ, where ℓ is the total length of C. If u(x) = u(x, y) is any scalar field, we define its arc length integral along the curve C to be
\[ \int_C u\; ds = \int_0^{\ell} u(x(s), y(s))\; ds. \tag{A.33} \]
For example, if ρ(x, y) represents the density at position (x, y) of a wire bent in the shape of a curve C, then the arc length integral ∫_C ρ(x, y) ds computes the total mass of the wire. In particular, the length of the curve is (tautologically) given by
\[ L(C) = \int_C ds = \int_0^{\ell} ds = \ell. \tag{A.34} \]

If we use an alternative parametrization x(t), with a ≤ t ≤ b, then the arc length integral is computed using the change of parameter formula (A.30), and so
\[ \int_C u\; ds = \int_a^b u(\mathbf{x}(t))\, \left\| \frac{d\mathbf{x}}{dt} \right\| dt = \int_a^b u(x(t), y(t)) \sqrt{\left( \frac{dx}{dt} \right)^2 + \left( \frac{dy}{dt} \right)^2}\; dt. \tag{A.35} \]
Changing the orientation of the curve does not alter the value of this type of line integral. Moreover, if we break up the curve into two nonoverlapping pieces, then the arc length integral decomposes into a sum:
\[ \int_{-C} u\; ds = \int_C u\; ds, \qquad \int_C u\; ds = \int_{C_1} u\; ds + \int_{C_2} u\; ds, \qquad C = C_1 \cup C_2. \tag{A.36} \]

Example A.17. A circular wire of radius 1 has density proportional to the distance of the point from the x axis. The mass of the wire is computed by the arc length integral
\[ \oint_C |\,y\,|\; ds = \int_0^{2\pi} |\sin t\,|\; dt = 4. \]
The arc length integral was evaluated using the parametrization x(t) = (cos t, sin t)^T for 0 ≤ t ≤ 2π, whereby ds = ‖ẋ‖ dt = dt.
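The wire-mass integral in Example A.17 is equally easy to confirm numerically; the snippet below (added for illustration) integrates the density |y| = |sin t| around the unit circle and returns 4.

    # Numerical check of the arc length integral in Example A.17.
    import numpy as np
    from scipy.integrate import quad

    # ds = dt on the unit circle; split at t = pi where |sin t| has a kink
    mass, _ = quad(lambda t: abs(np.sin(t)), 0.0, 2.0 * np.pi, points=[np.pi])
    print(mass)   # 4.0 (to quadrature accuracy)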

Line Integrals of Vector Fields


There are two intrinsic ways of integrating a vector field along a curve. In the first
version, we integrate its tangential component v · t, where t = dx/ds is the unit tangent
vector, with respect to arc length.
Definition A.18. The line integral of a vector field v along a parametrized curve x(t) is given by
\[ \int_C \mathbf{v} \cdot d\mathbf{x} = \int_C v_1(x, y)\, dx + v_2(x, y)\, dy = \int_C \mathbf{v} \cdot \mathbf{t}\; ds. \tag{A.37} \]
To evaluate the line integral, we parametrize the curve by x(t) for a ≤ t ≤ b, and then
\[ \int_C \mathbf{v} \cdot d\mathbf{x} = \int_a^b \mathbf{v}(\mathbf{x}(t)) \cdot \frac{d\mathbf{x}}{dt}\; dt = \int_a^b \left[ v_1(x(t), y(t))\, \frac{dx}{dt} + v_2(x(t), y(t))\, \frac{dy}{dt} \right] dt. \tag{A.38} \]
This result follows from the formulae (A.30), (A.31) for the arc length and unit tangent vector. In general, line integrals are independent of how the curve is parametrized, as long as it is traversed in the same direction. Reversing the direction of parametrization, i.e., changing the orientation of the curve, changes the sign of the line integral because it reverses the direction of the unit tangent. As before, line integrals can be decomposed into sums over components:
\[ \int_{-C} \mathbf{v} \cdot d\mathbf{x} = -\int_C \mathbf{v} \cdot d\mathbf{x}, \qquad \int_C \mathbf{v} \cdot d\mathbf{x} = \int_{C_1} \mathbf{v} \cdot d\mathbf{x} + \int_{C_2} \mathbf{v} \cdot d\mathbf{x}, \qquad C = C_1 \cup C_2. \tag{A.39} \]
In the second formula, one must take care to orient the two parts C₁, C₂ in the same direction as C.

Example A.19. Let C denote the circle of radius r centered at the origin. We will compute the line integral of the rotational vector field (A.19), namely
\[ \oint_C \mathbf{v} \cdot d\mathbf{x} = \oint_C \frac{y\, dx - x\, dy}{x^2 + y^2}. \]
The circle on the integral sign serves to remind us that we are integrating around a closed curve. We parametrize the circle by
\[ x(t) = r \cos t, \qquad y(t) = r \sin t, \qquad 0 \leq t \leq 2\pi. \]
Applying the basic line integral formula (A.38), we find
\[ \oint_C \frac{y\, dx - x\, dy}{x^2 + y^2} = \int_0^{2\pi} \frac{-\,r^2 \sin^2 t - r^2 \cos^2 t}{r^2}\; dt = -\,2\pi, \]
independent of the circle's radius. Note that the parametrization goes around the circle once in the counterclockwise direction. If we go around once in the clockwise direction, e.g., by using the parametrization x(t) = (r sin t, r cos t)^T, then the resulting line integral equals +2π.
If v represents the velocity vector field of a steady state fluid, then the line integral
(A.37) represents the circulation of the fluid around the curve. Indeed, v · t is proportional
to the force exerted by the fluid in the direction of the curve, and so the circulation integral
measures the average of the tangential fluid forces around the curve. Thus, for example,
the rotational vector field (A.19) has a net circulation of −2π around any circle centered at the origin. The minus sign tells us that the fluid is circulating in the clockwise direction, opposite to the direction in which we went around the circle.
A fluid flow is irrotational if the circulation is zero for all closed curves. An irrotational flow will not cause a paddle wheel to rotate: there will be just as much fluid pushing in one direction as in the opposite, and the net tangential forces will cancel each other out.
The connection between circulation and the curl of the velocity vector field will be made
evident shortly.
If the vector field happens to be the gradient of a scalar field, then we can readily
evaluate its line integral.
Theorem A.20. If v = ∇u is a gradient vector field, then its line integral
\[ \int_C \nabla u \cdot d\mathbf{x} = u(\mathbf{b}) - u(\mathbf{a}) \tag{A.40} \]
equals the difference between the values of the potential function at the endpoints a = x(a) and b = x(b) of the curve C.
Thus, the line integral of a gradient field is independent of path; its value does not depend on how you get from point a to point b. In particular, if C is a closed curve, then
\[ \oint_C \nabla u \cdot d\mathbf{x} = 0, \]
since the endpoints coincide: a = b. In fact, independence of path is both necessary and sufficient for the vector field to be a gradient.
Theorem A.21. Let v be a vector field defined on a domain Ω. Then the following are equivalent:
(a) The line integral ∫_C v · dx is independent of path.
(b) ∮_C v · dx = 0 for every closed curve C.
(c) v = ∇u is the gradient of some potential function defined on Ω.

In such cases, a potential function can be computed by integrating the vector field:
\[ u(\mathbf{x}) = \int_{\mathbf{a}}^{\mathbf{x}} \mathbf{v} \cdot d\mathbf{x}. \tag{A.41} \]
Here a is any fixed point (which defines the zero potential level), and we evaluate the line integral over any curve that connects a to x; path-independence says that it does not matter which curve we use to get from a to x. The proof that ∇u = v is left as an exercise.
Example A.22. The line integral
\[ \int_C \mathbf{v} \cdot d\mathbf{x} = \int_C (x^2 - 3\,y)\, dx + (2 - 3\,x)\, dy \]
of the vector field v = (x² − 3y, 2 − 3x)^T is independent of path. Indeed, parametrizing a curve C by (x(t), y(t)), a ≤ t ≤ b, leads to
\[ \int_C (x^2 - 3\,y)\, dx + (2 - 3\,x)\, dy = \int_a^b \left[ (x^2 - 3\,y)\, \frac{dx}{dt} + (2 - 3\,x)\, \frac{dy}{dt} \right] dt = \int_a^b \frac{d}{dt} \left[ \tfrac{1}{3}\,x^3 - 3\,x\,y + 2\,y \right] dt = \left[ \tfrac{1}{3}\,x^3 - 3\,x\,y + 2\,y \right]_{t=a}^{b}. \]
The result only depends on the endpoints a = (x(a), y(a))^T, b = (x(b), y(b))^T, and not on the detailed shape of the curve. Integrating from a = 0 to b = (x, y)^T produces the potential function
\[ u(x, y) = \tfrac{1}{3}\,x^3 - 3\,x\,y + 2\,y. \]
As guaranteed by (A.41), ∇u = v.
On the other hand, the line integral
\[ \int_C \mathbf{v} \cdot d\mathbf{x} = \int_C (x^3 - 2\,y)\, dx + x^2\, dy \]
of the vector field v = (x³ − 2y, x²)^T is not path-independent, and so v does not admit a potential function. Indeed, integrating from (0, 0) to (1, 1) along the straight line segment { (t, t) | 0 ≤ t ≤ 1 } produces
\[ \int_C (x^3 - 2\,y)\, dx + x^2\, dy = \int_0^1 \left( t^3 - 2\,t + t^2 \right) dt = -\,\tfrac{5}{12}. \]
On the other hand, integrating along the parabola { (t, t²) | 0 ≤ t ≤ 1 } yields a different value:
\[ \int_C (x^3 - 2\,y)\, dx + x^2\, dy = \int_0^1 \left( t^3 - 2\,t^2 + 2\,t^3 \right) dt = \tfrac{1}{12}. \]
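The two computations in Example A.22 can be confirmed symbolically. The sketch below is an added illustration; it checks that the gradient of the potential reproduces the first vector field, and re-evaluates the second field's line integral along both the straight segment and the parabola, obtaining the differing values −5/12 and 1/12.

    # Symbolic verification of Example A.22.
    import sympy as sp

    x, y, t = sp.symbols('x y t', real=True)

    # First field: gradient of u = x**3/3 - 3*x*y + 2*y
    u = x**3 / 3 - 3 * x * y + 2 * y
    print(sp.diff(u, x), sp.diff(u, y))     # x**2 - 3*y  and  2 - 3*x

    # Second field v = (x**3 - 2*y, x**2): integrate along two paths from (0,0) to (1,1)
    v1, v2 = x**3 - 2 * y, x**2

    def line_integral(xp, yp):
        integrand = (v1.subs({x: xp, y: yp}) * sp.diff(xp, t)
                     + v2.subs({x: xp, y: yp}) * sp.diff(yp, t))
        return sp.integrate(sp.expand(integrand), (t, 0, 1))

    print(line_integral(t, t))       # -5/12  (straight segment y = x)
    print(line_integral(t, t**2))    #  1/12  (parabola y = x**2)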

If v represents a force field, then the line integral (A.37) represents the amount of
work required to move along the given curve. Work is defined as force, or, more correctly,
the tangential component of the force in the direction of motion, times distance. The
line integral effectively totals up the infinitesimal contributions, the sum total representing
the total amount of work expended in moving along the curve. Note that the work is
independent of the parametrization of the curve. In other words (and, perhaps, counterintuitively), the amount of work expended doesn't depend upon how fast you move along
the curve.
According to Theorem A.21, the work does not depend on the route you use to get
from one point to the other if and only if the force field admits a potential function:
v = ∇u. Then, by (A.40), the work is just the difference in potential at the two points. In

The reason this doesn't agree with our intuition about work is that we are not taking frictional
effects into account, and these are typically velocity-dependent.

particular, for a gradient vector field there is no net work required to go around a closed
path.
Flux
The second type of line integral is found by integrating the normal component of the vector field along the curve:
\[ \int_C \mathbf{v} \cdot \mathbf{n}\; ds. \tag{A.42} \]

Using the formula (A.32) for the unit normal, we find that the inner product can be rewritten in the alternative form
\[ \mathbf{v} \cdot \mathbf{n} = v_1\, \frac{dy}{ds} - v_2\, \frac{dx}{ds} = \mathbf{v}^{\perp} \cdot \mathbf{t}, \]
where t = dx/ds is the unit tangent, while
\[ \mathbf{v}^{\perp} = ( -\,v_2,\; v_1 )^T \tag{A.43} \]
is a vector field that is everywhere orthogonal to the velocity vector field v = (v₁, v₂)^T. Thus, the normal line integral (A.42) can be rewritten as a tangential line integral:
\[ \int_C \mathbf{v} \cdot \mathbf{n}\; ds = \int_C v_1\, dy - v_2\, dx = \int_C \mathbf{v}^{\perp} \cdot d\mathbf{x} = \int_C \mathbf{v}^{\perp} \cdot \mathbf{t}\; ds. \tag{A.44} \]

If v represents the velocity vector field for a steady-state fluid flow, then the inner product v · n with the unit normal measures the flux of the fluid flow across the curve at the given point. The flux is positive if the fluid is moving in the normal direction n and negative if it is moving in the opposite direction. If the vector field admits a potential, v = ∇u, then the flux
\[ \mathbf{v} \cdot \mathbf{n} = \nabla u \cdot \mathbf{n} = \frac{\partial u}{\partial n} \tag{A.45} \]
equals its normal derivative, i.e., the directional derivative of the potential function u in the normal direction to the curve. The line integral ∫_C v · n ds sums up the individual

fluxes, and so represents the total flux across the curve, meaning the total volume of fluid
that passes across the curve per unit time in the direction assigned by the unit normal
n. In particular, if C is a simple closed curve and n is the outward normal, then the flux
integral (A.42) measures the net outflow of fluid across C; if negative, it represents an
inflow. The total flux is zero if and only if the total amount of fluid contained within the
curve does not change. Thus, in the absence of sources or sinks, an incompressible fluid,
such as water, will have zero net flux around any closed curve since the total amount of
fluid within any given region cannot change.
Example A.23. For the radial vector field
\[ \mathbf{v} = \mathbf{x} = \begin{pmatrix} x \\ y \end{pmatrix}, \qquad \text{we have} \qquad \mathbf{v}^{\perp} = \begin{pmatrix} -\,y \\ x \end{pmatrix}. \]

Figure A.10. Double Integration Domain.

As we saw in Example A.7, v represents the fluid flow due to a source at the origin. Thus, the resulting fluid flux across a circle C of radius r is computed using the line integral
\[ \oint_C \mathbf{v} \cdot \mathbf{n}\; ds = \oint_C x\, dy - y\, dx = \int_0^{2\pi} \left( r^2 \sin^2 t + r^2 \cos^2 t \right) dt = 2\pi r^2. \]
Therefore, the source fluid flow has a net outflow of 2πr² units across a circle of radius r. This is not an incompressible flow!

A.6. Double Integrals.


We assume that the student is familiar with the foundations of multiple integration, and merely review a few of the highlights in this section. Given a scalar function u(x, y) defined on a domain Ω, its double integral
\[ \iint_{\Omega} u(x, y)\; dx\, dy = \iint_{\Omega} u(\mathbf{x})\; d\mathbf{x} \tag{A.46} \]
is equal to the volume of the solid lying underneath the graph of u over Ω. If u(x, y) represents the density at position (x, y)^T in a plate having the shape of the domain Ω, then the double integral (A.46) measures the total mass of the plate. In particular,
\[ \operatorname{area} \Omega = \iint_{\Omega} dx\, dy \]
is equal to the area of the domain Ω.


In the particular case when
\[ \Omega = \{\, \varphi(x) < y < \psi(x),\; a < x < b \,\} \tag{A.47} \]
is given as the region lying between the graphs of two functions, as in Figure A.10, then we can evaluate the double integral by repeated scalar integration,
\[ \iint_{\Omega} u(x, y)\; dx\, dy = \int_a^b \left( \int_{\varphi(x)}^{\psi(x)} u(x, y)\; dy \right) dx, \tag{A.48} \]
in the two coordinate directions. Fubini's Theorem states that one can equally well evaluate the integral in the reverse order,
\[ \iint_{\Omega} u(x, y)\; dx\, dy = \int_c^d \left( \int_{\varphi(y)}^{\psi(y)} u(x, y)\; dx \right) dy, \tag{A.49} \]
in the case when
\[ \Omega = \{\, \varphi(y) < x < \psi(y),\; c < y < d \,\} \tag{A.50} \]
lies between the graphs of two functions of y.

Example A.24. Compute the volume of the solid lying under the positive part of the paraboloid z = 1 − x² − y². Note that z > 0 if and only if x² + y² < 1, and hence we should evaluate the double integral
\[ \iint_{D} (1 - x^2 - y^2)\; dx\, dy \]
over the unit disk D = { x² + y² < 1 }. We may represent the disk in the form (A.47), so that
\[ D = \left\{\, -\sqrt{1 - x^2} < y < \sqrt{1 - x^2},\; -1 < x < 1 \,\right\}. \]
Therefore, we evaluate the volume by repeated integration:
\[ \iint_{D} (1 - x^2 - y^2)\; dx\, dy = \int_{-1}^{1} \left[ \int_{-\sqrt{1-x^2}}^{\sqrt{1-x^2}} (1 - x^2 - y^2)\; dy \right] dx = \int_{-1}^{1} \left[ y - x^2 y - \tfrac{1}{3} y^3 \right]_{y=-\sqrt{1-x^2}}^{\sqrt{1-x^2}} dx = \int_{-1}^{1} \tfrac{4}{3} (1 - x^2)^{3/2}\; dx = \tfrac{1}{2}\pi. \]
The final integral is most easily effected via a trigonometric substitution.
Alternatively, and much easier, one can use polar coordinates to evaluate the integral. The unit disk takes the form D = { 0 ≤ r < 1, 0 ≤ θ < 2π }, and so
\[ \iint_{D} (1 - x^2 - y^2)\; dx\, dy = \iint_{D} (1 - r^2)\, r\; dr\, d\theta = \int_0^1 \int_0^{2\pi} (r - r^3)\; d\theta\, dr = \tfrac{1}{2}\pi. \]
We are using the standard formula
\[ dx\, dy = r\; dr\, d\theta \tag{A.51} \]
for the area element in polar coordinates, [9].
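Both evaluations in Example A.24 are easy to reproduce with symbolic integration; the sketch below (added for illustration) computes the Cartesian iterated integral and the polar one, each returning π/2.

    # Symbolic check of Example A.24: volume under z = 1 - x**2 - y**2 over the unit disk.
    import sympy as sp

    x, y, r, theta = sp.symbols('x y r theta', real=True)

    cartesian = sp.integrate(1 - x**2 - y**2,
                             (y, -sp.sqrt(1 - x**2), sp.sqrt(1 - x**2)),
                             (x, -1, 1))
    polar = sp.integrate((1 - r**2) * r, (r, 0, 1), (theta, 0, 2 * sp.pi))

    print(cartesian, polar)   # pi/2  pi/2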


The polar integration formula (A.51) is a consequence of the general change of variables formula for double integrals. If
\[ x = x(s, t), \qquad y = y(s, t), \]
is an invertible change of variables that maps (s, t)^T ∈ D to (x, y)^T ∈ Ω, then
\[ \iint_{\Omega} u(x, y)\; dx\, dy = \iint_{D} U(s, t)\, \left| \frac{\partial(x, y)}{\partial(s, t)} \right|\; ds\, dt. \tag{A.52} \]
Here U(s, t) = u(x(s, t), y(s, t)) denotes the function when rewritten in the new variables, while
\[ \frac{\partial(x, y)}{\partial(s, t)} = \det \begin{pmatrix} x_s & x_t \\ y_s & y_t \end{pmatrix} = \frac{\partial x}{\partial s}\frac{\partial y}{\partial t} - \frac{\partial x}{\partial t}\frac{\partial y}{\partial s} \tag{A.53} \]
is the Jacobian determinant of the functions x, y with respect to the variables s, t, which measures the local change in area under the map.
In the event that the domain of integration is more complicated than either (A.47) or (A.50), then one performs surgery by chopping up the domain
\[ \Omega = \Omega_1 \cup \Omega_2 \cup \cdots \cup \Omega_k \]
into smaller pieces. The pieces Ω_i are not allowed to overlap, and so have at most their boundary curves in common. The double integral
\[ \iint_{\Omega} u(x, y)\; dx\, dy = \iint_{\Omega_1} u(x, y)\; dx\, dy + \cdots + \iint_{\Omega_k} u(x, y)\; dx\, dy \tag{A.54} \]
can then be evaluated as a sum of the double integrals over the individual pieces.

A.7. Green's Theorem.


For double integrals, the role of the Fundamental Theorem of Calculus is played by Green's Theorem. The Fundamental Theorem relates an integral over an interval I = [a, b] to an evaluation at the boundary ∂I = {a, b}, which consists of the two endpoints of the interval. In a similar manner, Green's Theorem relates certain double integrals over a planar domain Ω to line integrals around its boundary curve(s) ∂Ω.
Theorem A.25. Let v(x) be a smooth vector field defined on a bounded domain Ω ⊂ R². Then the line integral of v around the boundary ∂Ω equals the double integral of the curl of v over the domain. This result can be written in either of the equivalent forms
\[ \iint_{\Omega} \nabla \wedge \mathbf{v}\; dx\, dy = \oint_{\partial\Omega} \mathbf{v} \cdot d\mathbf{x}, \qquad \text{or} \qquad \iint_{\Omega} \left( \frac{\partial v_2}{\partial x} - \frac{\partial v_1}{\partial y} \right) dx\, dy = \oint_{\partial\Omega} \left( v_1\, dx + v_2\, dy \right). \tag{A.55} \]
An outline of the proof appears in Exercise . Green's Theorem was first formulated in 1828 by the English mathematician and miller George Green, and, contemporaneously, by the Russian mathematician Mikhail Ostrogradski.
Example A.26. Let us apply Green's Theorem A.25 to the particular vector field v = (0, x)^T. Since ∇ ∧ v ≡ 1, we find
\[ \oint_{\partial\Omega} x\; dy = \iint_{\Omega} dx\, dy = \operatorname{area} \Omega. \]
This means that we can compute the area of a planar domain by computing the indicated line integral around its boundary! For example, to compute the area of a disk D_r of radius r, we parametrize its bounding circle C_r by (r cos t, r sin t)^T for 0 ≤ t ≤ 2π, and compute
\[ \operatorname{area} D_r = \oint_{C_r} x\; dy = \int_0^{2\pi} r^2 \cos^2 t\; dt = \pi r^2. \]
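The area formula in Example A.26 works for any domain with a piecewise smooth boundary. The following numerical sketch is an added illustration (the radius r = 1.5 and semi-axes a = 2, b = 1 are arbitrary); it evaluates the boundary integral ∮ x dy around a circle and around an ellipse, recovering πr² and πab respectively.

    # Area via the boundary line integral of Example A.26:  area = closed integral of x dy.
    import numpy as np
    from scipy.integrate import quad

    def area_from_boundary(x_of_t, dy_dt):
        value, _ = quad(lambda t: x_of_t(t) * dy_dt(t), 0.0, 2.0 * np.pi)
        return value

    r = 1.5
    disk = area_from_boundary(lambda t: r * np.cos(t), lambda t: r * np.cos(t))
    print(disk, np.pi * r**2)          # both ~7.0686

    a, b = 2.0, 1.0
    ellipse = area_from_boundary(lambda t: a * np.cos(t), lambda t: b * np.cos(t))
    print(ellipse, np.pi * a * b)      # both ~6.2832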

If we interpret v as the velocity vector field associated with a steady state fluid flow, then the right hand side of formula (A.55) represents the circulation of the fluid around the boundary ∂Ω of the domain Ω. Green's Theorem implies that the double integral of the curl of the velocity vector must equal this circulation line integral.
According to (mean2 ), if we divide the double integral in (A.55) by the area of the domain,
\[ \frac{1}{\operatorname{area}\,\Omega} \iint_{\Omega} \nabla \wedge \mathbf{v}\; d\mathbf{x} = M[\, \nabla \wedge \mathbf{v}\, ], \]
we obtain the mean of the curl ∇ ∧ v of the vector field over the domain. In particular, if the domain Ω is very small, then ∇ ∧ v does not vary much, and so its value at any point in the domain is more or less equal to the mean. On the other hand, the right hand side of (A.55) represents the circulation around the boundary ∂Ω. Thus, we conclude that the curl ∇ ∧ v of the velocity vector field represents an infinitesimal circulation at the point it is evaluated. In particular, the fluid is irrotational, with no net circulation around any curve, if and only if ∇ ∧ v ≡ 0 everywhere. Under the assumption that its domain of definition is simply connected, Theorem A.21 tells us that this is equivalent to the existence of a velocity potential u with ∇u = v.
We can also apply Green's Theorem A.25 to flux line integrals of the form (A.42). Using the identification (A.44) followed by (A.55), we find that
\[ \oint_{\partial\Omega} \mathbf{v} \cdot \mathbf{n}\; ds = \oint_{\partial\Omega} \mathbf{v}^{\perp} \cdot d\mathbf{x} = \iint_{\Omega} \nabla \wedge \mathbf{v}^{\perp}\; dx\, dy. \]
However, note that the curl of the orthogonal vector field (A.43), namely
\[ \nabla \wedge \mathbf{v}^{\perp} = \frac{\partial v_1}{\partial x} + \frac{\partial v_2}{\partial y} = \nabla \cdot \mathbf{v}, \tag{A.56} \]
coincides with the divergence of the original velocity field. Combining these together, we have proved the divergence or flux form of Green's Theorem:
\[ \iint_{\Omega} \nabla \cdot \mathbf{v}\; dx\, dy = \oint_{\partial\Omega} \mathbf{v} \cdot \mathbf{n}\; ds. \tag{A.57} \]
As before, Ω is a bounded domain, and n is the unit outward normal to its boundary ∂Ω.
In the fluid flow interpretation, the right hand side of (A.57) represents the net fluid flux out of the region Ω. Thus, the double integral of the divergence of the flow vector field must equal this net change in area. In the absence of sources or sinks, the divergence ∇ · v represents the local change in area of the fluid at each point. In particular, the fluid is incompressible if and only if ∇ · v ≡ 0 everywhere.
An ideal fluid flow is both incompressible, ∇ · v = 0, and irrotational, ∇ ∧ v = 0. Assuming its domain is simply connected, we introduce the velocity potential u(x, y), so ∇u = v. Therefore,
\[ 0 = \nabla \cdot \mathbf{v} = \nabla \cdot \nabla u = u_{xx} + u_{yy}. \tag{A.58} \]
Thus, the velocity potential for an incompressible, irrotational fluid flow is a harmonic function, i.e., it satisfies the Laplace equation! Water waves are typically modeled in this manner, and so many problems in fluid mechanics rely on the solution to Laplace's equation.
Appendix B
Vector Calculus in Three Dimensions
Before continuing on to the analysis of partial differential equations in three space
dimensions, we should first review the fundamentals of three-dimensional vector calculus.
The student is expected to have already encountered most of these topics in an introductory multi-variable calculus course. We shall be dealing with calculus on curves, surfaces
and solid bodies in three-dimensional space. The three methods of integration (line, surface and volume or triple integrals) and the fundamental vector differential operators (gradient, curl and divergence) are intimately related. The differential operators and integrals underlie the multivariate versions of the Fundamental Theorem of Calculus, known as Stokes' Theorem and the Divergence Theorem.
All of these topics will be reviewed in rapid succession, with most details being relegated to the exercises. A more detailed development can be found in any reasonable
multi-variable calculus text, including [9, 30, 48].

B.1. Dot and Cross Product.


We begin by reviewing the basic algebraic operations between vectors in three-dimensional space R³. We shall continue to use column vector notation
\[ \mathbf{v} = \begin{pmatrix} v_1 \\ v_2 \\ v_3 \end{pmatrix} = (v_1, v_2, v_3)^T \in \mathbb{R}^3. \]

The standard basis vectors of R³ are
\[ \mathbf{e}_1 = \mathbf{i} = \begin{pmatrix} 1 \\ 0 \\ 0 \end{pmatrix}, \qquad \mathbf{e}_2 = \mathbf{j} = \begin{pmatrix} 0 \\ 1 \\ 0 \end{pmatrix}, \qquad \mathbf{e}_3 = \mathbf{k} = \begin{pmatrix} 0 \\ 0 \\ 1 \end{pmatrix}. \tag{B.1} \]
We prefer the former notation, as it easily generalizes to n-dimensional space. Any vector
\[ \mathbf{v} = \begin{pmatrix} v_1 \\ v_2 \\ v_3 \end{pmatrix} = v_1 \mathbf{e}_1 + v_2 \mathbf{e}_2 + v_3 \mathbf{e}_3 \]
is a linear combination of the basis vectors. The coefficients v₁, v₂, v₃ are the coordinates of the vector with respect to the standard basis.
Space comes equipped with an orientation: either right- or left-handed. One cannot alter the orientation by physical motion, although looking in a mirror (or, mathematically, performing a reflection) reverses the orientation. The standard basis vectors are graphed with a right-hand orientation, as in Figure rhr . When you point with your right hand, e₁ lies in the direction of your index finger, e₂ lies in the direction of your middle finger, and e₃ is in the direction of your thumb. In general, a set of three linearly independent vectors v₁, v₂, v₃ is said to have a right-handed orientation if they have the same orientation as the standard basis. It is not difficult to prove that this is the case if and only if the determinant of the 3 × 3 matrix whose columns are the given vectors is positive: det(v₁, v₂, v₃) > 0. Interchanging the order of the vectors may switch their orientation; for example, if v₁, v₂, v₃ are right-handed, then v₂, v₁, v₃ is left-handed.
We have already made extensive use of the Euclidean dot product
\[ \mathbf{v} \cdot \mathbf{w} = v_1 w_1 + v_2 w_2 + v_3 w_3, \qquad \text{where} \qquad \mathbf{v} = \begin{pmatrix} v_1 \\ v_2 \\ v_3 \end{pmatrix}, \quad \mathbf{w} = \begin{pmatrix} w_1 \\ w_2 \\ w_3 \end{pmatrix}, \tag{B.2} \]
along with the Euclidean norm
\[ \| \mathbf{v} \| = \sqrt{\mathbf{v} \cdot \mathbf{v}} = \sqrt{v_1^2 + v_2^2 + v_3^2}. \tag{B.3} \]

As in Definition 3.1, the dot product is bilinear, symmetric: v · w = w · v, and positive. The Cauchy-Schwarz inequality
\[ | \mathbf{v} \cdot \mathbf{w} | \leq \| \mathbf{v} \|\,\| \mathbf{w} \| \tag{B.4} \]
implies that the dot product can be used to measure the angle θ between the two vectors v and w:
\[ \mathbf{v} \cdot \mathbf{w} = \| \mathbf{v} \|\,\| \mathbf{w} \| \cos \theta. \tag{B.5} \]
(See also (3.12).)
Remark : In this chapter, we will only use the Euclidean dot product and its associated norm. Adapting the constructions to more general norms and inner products is an
interesting exercise, but will not concern us here.
Also of great importance, but particular to three-dimensional space, is the cross product between vectors. While the dot product produces a scalar, the three-dimensional cross product produces a vector, defined by the formula
\[ \mathbf{v} \wedge \mathbf{w} = \begin{pmatrix} v_2 w_3 - v_3 w_2 \\ v_3 w_1 - v_1 w_3 \\ v_1 w_2 - v_2 w_1 \end{pmatrix}, \qquad \text{where} \qquad \mathbf{v} = \begin{pmatrix} v_1 \\ v_2 \\ v_3 \end{pmatrix}, \quad \mathbf{w} = \begin{pmatrix} w_1 \\ w_2 \\ w_3 \end{pmatrix}. \tag{B.6} \]
This assumes that space is identified with the three-dimensional Euclidean space R 3 , or,
more generally, an oriented three-dimensional manifold, [ dg ].


We have chosen to employ the more modern wedge notation $\mathbf{v} \wedge \mathbf{w}$ rather than the more traditional cross symbol, $\mathbf{v} \times \mathbf{w}$, for this quantity. The cross product formula is most easily memorized as a formal $3 \times 3$ determinant
$$
\mathbf{v} \wedge \mathbf{w} = \det \begin{pmatrix} v_1 & w_1 & \mathbf{e}_1 \\ v_2 & w_2 & \mathbf{e}_2 \\ v_3 & w_3 & \mathbf{e}_3 \end{pmatrix}
= (v_2 w_3 - v_3 w_2)\,\mathbf{e}_1 + (v_3 w_1 - v_1 w_3)\,\mathbf{e}_2 + (v_1 w_2 - v_2 w_1)\,\mathbf{e}_3,
\tag{B.7}
$$
involving the standard basis vectors (B.1). We note that, like the dot product, the cross product is a bilinear function, meaning that
$$
(c\,\mathbf{u} + d\,\mathbf{v}) \wedge \mathbf{w} = c\,(\mathbf{u} \wedge \mathbf{w}) + d\,(\mathbf{v} \wedge \mathbf{w}),
\qquad
\mathbf{u} \wedge (c\,\mathbf{v} + d\,\mathbf{w}) = c\,(\mathbf{u} \wedge \mathbf{v}) + d\,(\mathbf{u} \wedge \mathbf{w}),
\tag{B.8}
$$
for any vectors $\mathbf{u}, \mathbf{v}, \mathbf{w} \in \mathbb{R}^3$ and any scalars $c, d \in \mathbb{R}$. On the other hand, unlike the dot product, the cross product is an anti-symmetric quantity:
$$
\mathbf{v} \wedge \mathbf{w} = -\,\mathbf{w} \wedge \mathbf{v},
\tag{B.9}
$$
which changes its sign when the two vectors are interchanged. In particular, the cross product of a vector with itself is automatically zero: $\mathbf{v} \wedge \mathbf{v} = \mathbf{0}$.
Geometrically, the cross product vector $\mathbf{u} = \mathbf{v} \wedge \mathbf{w}$ is orthogonal to the two vectors $\mathbf{v}$ and $\mathbf{w}$:
$$
\mathbf{v} \cdot (\mathbf{v} \wedge \mathbf{w}) = 0 = \mathbf{w} \cdot (\mathbf{v} \wedge \mathbf{w}).
$$
Thus, when $\mathbf{v}$ and $\mathbf{w}$ are linearly independent, their cross product $\mathbf{u} = \mathbf{v} \wedge \mathbf{w} \neq \mathbf{0}$ defines a normal direction to the plane spanned by $\mathbf{v}$ and $\mathbf{w}$. The direction of the cross product is fixed by the requirement that $\mathbf{v}, \mathbf{w}, \mathbf{u} = \mathbf{v} \wedge \mathbf{w}$ form a right-handed triple. The length of the cross product vector is equal to the area of the parallelogram defined by the two vectors, which is
$$
\| \mathbf{v} \wedge \mathbf{w} \| = \| \mathbf{v} \|\, \| \mathbf{w} \|\, | \sin\theta |,
\tag{B.10}
$$
where $\theta$ is the angle between the two vectors, as in Figure para. Consequently, the cross product vector is zero, $\mathbf{v} \wedge \mathbf{w} = \mathbf{0}$, if and only if the two vectors are collinear (linearly dependent) and hence only span a line.
The scalar triple product $\mathbf{u} \cdot (\mathbf{v} \wedge \mathbf{w})$ between three vectors $\mathbf{u}, \mathbf{v}, \mathbf{w}$ is defined as the dot product of the first vector with the cross product of the second and third vectors. The parentheses are often omitted because there is only one way to make sense of $\mathbf{u} \cdot \mathbf{v} \wedge \mathbf{w}$. Combining (B.2), (B.7) shows that one can compute the triple product by the determinantal formula
$$
\mathbf{u} \cdot \mathbf{v} \wedge \mathbf{w} = \det \begin{pmatrix} u_1 & v_1 & w_1 \\ u_2 & v_2 & w_2 \\ u_3 & v_3 & w_3 \end{pmatrix}.
\tag{B.11}
$$
By the properties of the determinant, permuting the order of the vectors merely changes the sign of the triple product:
$$
\mathbf{u} \cdot \mathbf{v} \wedge \mathbf{w} = -\,\mathbf{v} \cdot \mathbf{u} \wedge \mathbf{w} = +\,\mathbf{v} \cdot \mathbf{w} \wedge \mathbf{u} = \cdots.
$$

The triple product vanishes, $\mathbf{u} \cdot \mathbf{v} \wedge \mathbf{w} = 0$, if and only if the three vectors are linearly dependent, i.e., coplanar or collinear. The triple product is positive, $\mathbf{u} \cdot \mathbf{v} \wedge \mathbf{w} > 0$, if and only if the three vectors form a right-handed basis. Its magnitude $| \mathbf{u} \cdot \mathbf{v} \wedge \mathbf{w} |$ measures the volume of the parallelepiped spanned by the three vectors $\mathbf{u}, \mathbf{v}, \mathbf{w}$, as in Figure ppp.
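These algebraic identities are simple to confirm numerically. The short NumPy sketch below (our own illustration, with arbitrarily chosen vectors) verifies the orthogonality property of the cross product and the agreement between the scalar triple product and the determinant formula (B.11).

    import numpy as np

    u = np.array([1.0, 2.0, 0.0])
    v = np.array([0.0, 1.0, 1.0])
    w = np.array([2.0, 0.0, 1.0])

    vw = np.cross(v, w)                    # v ^ w, formula (B.6)
    print(np.dot(v, vw), np.dot(w, vw))    # both 0: v ^ w is orthogonal to v and w
    print(np.linalg.norm(vw))              # area of the parallelogram, formula (B.10)

    triple = np.dot(u, vw)                 # u . v ^ w
    volume = abs(np.linalg.det(np.column_stack((u, v, w))))
    print(np.isclose(abs(triple), volume)) # True: |triple product| = parallelepiped volume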

B.2. Curves.
A space curve $C \subset \mathbb{R}^3$ is parametrized by a vector-valued function
$$
\mathbf{x}(t) = \begin{pmatrix} x(t) \\ y(t) \\ z(t) \end{pmatrix} \in \mathbb{R}^3,
\qquad a \leq t \leq b,
\tag{B.12}
$$

that depends upon a single parameter $t$ that varies over some interval. We shall always assume that $\mathbf{x}(t)$ is continuously differentiable. The curve is smooth provided its tangent vector is continuous and everywhere nonzero:
$$
\frac{d\mathbf{x}}{dt} = \dot{\mathbf{x}} = \begin{pmatrix} \dot{x} \\ \dot{y} \\ \dot{z} \end{pmatrix} \neq \mathbf{0}.
\tag{B.13}
$$
As in the planar situation, the smoothness condition (B.13) precludes the formation of corners, cusps or other singularities in the curve.
Physically, we can think of a curve as the trajectory described by a particle moving in space. At each time $t$, the tangent vector $\dot{\mathbf{x}}(t)$ represents the instantaneous velocity of the particle. Thus, as long as the particle moves with nonzero speed, $\| \dot{\mathbf{x}} \| = \sqrt{\dot{x}^2 + \dot{y}^2 + \dot{z}^2} > 0$, its trajectory is necessarily a smooth curve.
Example B.1. A charged particle in a constant magnetic field moves along the curve
$$
\mathbf{x}(t) = \begin{pmatrix} r \cos t \\ r \sin t \\ c\,t \end{pmatrix},
\tag{B.14}
$$
where $c > 0$ and $r > 0$ are positive constants. The curve describes a circular helix of radius $r$ spiraling up the $z$ axis. The parameter $c$ determines the pitch of the helix, indicating how tightly its coils are wound; the smaller $c$ is, the closer the winding. See Figure helix for an illustration. DNA is, remarkably, formed in the shape of a (bent and twisted) double helix. The tangent to the helix at a point $\mathbf{x}(t)$ is the vector
$$
\dot{\mathbf{x}}(t) = \begin{pmatrix} -\,r \sin t \\ r \cos t \\ c \end{pmatrix}.
$$
Note that the speed of the particle,
$$
\| \dot{\mathbf{x}} \| = \sqrt{r^2 \sin^2 t + r^2 \cos^2 t + c^2} = \sqrt{r^2 + c^2},
\tag{B.15}
$$
remains constant, although the velocity vector $\dot{\mathbf{x}}$ twists around.



Figure B.1.   Two Views of a Trefoil Knot.

Most of the terminology introduced in Chapter A for planar curves carries over to
space curves without significant alteration. In particular, a curve is simple if it never
crosses itself, and closed if its ends meet, x(a) = x(b). In the plane, simple closed curves
are all topologically equivalent, meaning one can be continuously deformed to the other.
In space, this is no longer true. Closed curves can be knotted, and thus have nontrivial
topology.
Example B.2. The curve
$$
\mathbf{x}(t) = \begin{pmatrix} (2 + \cos 3t) \cos 2t \\ (2 + \cos 3t) \sin 2t \\ \sin 3t \end{pmatrix},
\qquad 0 \leq t \leq 2\pi,
\tag{B.16}
$$
describes a closed curve that is in the shape of a trefoil knot, as depicted in Figure B.1. The trefoil is a genuine knot, meaning that it cannot be deformed into an unknotted circle without cutting and retying. (However, a rigorous proof of this fact is not easy.) The trefoil is the simplest of the toroidal knots, investigated in more detail in Exercise .
The study and classification of knots is a subject of great historical importance. Indeed, they were first considered from a mathematical viewpoint in the nineteenth century, when the applied mathematician William Thomson (later Lord Kelvin), [Kelvin], proposed a theory of atoms based on knots! In recent years, knot theory has witnessed a tremendous revival, owing to its great relevance to modern mathematics and physics. We refer the interested reader to the advanced text [74] for details.

B.3. Line Integrals.


In Section A.5, we encountered three different types of line integrals along plane curves. Two of these, integrals with respect to arc length, (A.35), and circulation integrals, (A.37), are directly applicable to space curves. On the other hand, for three-dimensional flows, the analog of the flux line integral (A.42) is a surface integral, and will be discussed later in the chapter.

Arc Length
The length of the space curve $\mathbf{x}(t)$ over the parameter range $a \leq t \leq b$ is computed by integrating the norm of its tangent vector:
$$
L(C) = \int_a^b \left\| \frac{d\mathbf{x}}{dt} \right\| dt = \int_a^b \sqrt{\dot{x}^2 + \dot{y}^2 + \dot{z}^2}\; dt.
\tag{B.17}
$$
It is not hard to show that the length of the curve is independent of the parametrization, as it should be.
Starting at the endpoint $\mathbf{x}(a)$, the arc length parameter $s$ is given by
$$
s = \int_a^t \left\| \frac{d\mathbf{x}}{dt} \right\| dt,
\qquad \text{and so} \qquad
ds = \| \dot{\mathbf{x}} \|\, dt = \sqrt{\dot{x}^2 + \dot{y}^2 + \dot{z}^2}\; dt.
\tag{B.18}
$$
The arc length $s$ measures the distance along the curve starting from the initial point $\mathbf{x}(a)$. Thus, the length of the part of the curve between the points with arc length parameters $s = s_0$ and $s = s_1$ is exactly $s_1 - s_0$. It is often convenient to reparametrize the curve by its arc length, $\mathbf{x}(s)$. This has the same effect as moving along the curve at unit speed, since, by the chain rule,
$$
\frac{d\mathbf{x}}{ds} = \frac{d\mathbf{x}}{dt}\, \frac{dt}{ds} = \frac{\dot{\mathbf{x}}}{\| \dot{\mathbf{x}} \|},
\qquad \text{so that} \qquad
\left\| \frac{d\mathbf{x}}{ds} \right\| = 1.
$$
Therefore $d\mathbf{x}/ds$ is the unit tangent vector pointing in the direction of motion along the curve.
Example B.3. The length of one turn of the helix (B.14) is, using (B.15),
$$
L(C) = \int_0^{2\pi} \left\| \frac{d\mathbf{x}}{dt} \right\| dt = \int_0^{2\pi} \sqrt{r^2 + c^2}\; dt = 2\pi \sqrt{r^2 + c^2}.
$$
The arc length parameter, measured from the point $\mathbf{x}(0) = (r, 0, 0)^T$, is merely a rescaling,
$$
s = \int_0^t \sqrt{r^2 + c^2}\; dt = \sqrt{r^2 + c^2}\; t,
$$
of the original parameter $t$. When the helix is parametrized by arc length,
$$
\mathbf{x}(s) = \left( r \cos \frac{s}{\sqrt{r^2 + c^2}},\;\; r \sin \frac{s}{\sqrt{r^2 + c^2}},\;\; \frac{c\,s}{\sqrt{r^2 + c^2}} \right)^{\!T},
$$
we move along it with unit speed. It now takes time $s = 2\pi \sqrt{r^2 + c^2}$ to complete one turn of the helix.
Example B.4. To compute the length of the trefoil knot (B.16), we begin by computing the tangent vector
$$
\frac{d\mathbf{x}}{dt} = \begin{pmatrix}
-\,2\,(2 + \cos 3t) \sin 2t - 3 \sin 3t \cos 2t \\
2\,(2 + \cos 3t) \cos 2t - 3 \sin 3t \sin 2t \\
3 \cos 3t
\end{pmatrix}.
$$

After some algebra involving trigonometric identities, we find
$$
\| \dot{\mathbf{x}} \| = \sqrt{27 + 16 \cos 3t + 2 \cos 6t}\,,
$$
which is never 0. Unfortunately, the resulting arc length integral
$$
\int_0^{2\pi} \| \dot{\mathbf{x}} \|\; dt = \int_0^{2\pi} \sqrt{27 + 16 \cos 3t + 2 \cos 6t}\; dt
$$
cannot be evaluated in elementary terms. Numerical integration can be used to find the approximate value 31.8986 for the length of the knot.
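The quoted value is easy to reproduce; here is a minimal numerical sketch (our own, assuming SciPy and NumPy are available) that integrates the speed formula above.

    import numpy as np
    from scipy.integrate import quad

    # speed of the trefoil knot (B.16): || dx/dt || = sqrt(27 + 16 cos 3t + 2 cos 6t)
    speed = lambda t: np.sqrt(27 + 16*np.cos(3*t) + 2*np.cos(6*t))

    length, err = quad(speed, 0, 2*np.pi)
    print(length)    # approximately 31.8986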
The arc length integral of a scalar field $u(\mathbf{x}) = u(x, y, z)$ along a curve $C$ is
$$
\int_C u\; ds = \int_0^{\ell} u(\mathbf{x}(s))\; ds = \int_0^{\ell} u(x(s), y(s), z(s))\; ds,
\tag{B.19}
$$
where $\ell$ is the total length of the curve. For example, if $\rho(x, y, z)$ represents the density at position $\mathbf{x} = (x, y, z)$ of a wire bent in the shape of the curve $C$, then $\int_C \rho\; ds$ represents the total mass of the wire. In particular, the integral
$$
\int_C ds = \int_0^{\ell} ds = \ell
$$
recovers the length of the curve.


If it is not convenient to work directly with the arc length parametrization, we can still compute the arc length integral in terms of the original parametrization $\mathbf{x}(t)$ for $a \leq t \leq b$. Using the change of parameter formula (B.18), we find
$$
\int_C u\; ds = \int_a^b u(\mathbf{x}(t))\, \| \dot{\mathbf{x}} \|\; dt
= \int_a^b u(x(t), y(t), z(t)) \sqrt{\dot{x}^2 + \dot{y}^2 + \dot{z}^2}\; dt.
\tag{B.20}
$$
Example B.5. The density of a wire that is wound in the shape of a helix is proportional to its height. Let us compute the mass of one full turn of the helical wire. Thus, the density is given by $\rho(x, y, z) = a\,z$, where $a$ is the constant of proportionality, and we are assuming $z \geq 0$. Substituting into (B.20), the total mass of the wire is
$$
\int_C \rho\; ds = \int_0^{2\pi} a\,c\,t\, \sqrt{r^2 + c^2}\; dt = 2\pi^2\, a\,c \sqrt{r^2 + c^2}.
$$

Line Integrals of Vector Fields

As in the two-dimensional situation (A.37), the line integral of a vector field $\mathbf{v}$ along a parametrized curve $\mathbf{x}(t)$ is obtained by integration of its tangential component with respect to the arc length. The tangential component of $\mathbf{v}$ is given by
$$
\mathbf{v} \cdot \mathbf{t},
\qquad \text{where} \qquad
\mathbf{t} = \frac{d\mathbf{x}}{ds}
$$
is the unit tangent vector to the curve. Thus, the line integral of $\mathbf{v}$ is written as
$$
\int_C \mathbf{v} \cdot d\mathbf{x}
= \int_C v_1(x, y, z)\, dx + v_2(x, y, z)\, dy + v_3(x, y, z)\, dz
= \int_C \mathbf{v} \cdot \mathbf{t}\; ds.
\tag{B.21}
$$
We can evaluate the line integral in terms of an arbitrary parametrization of the curve by the general formula
$$
\int_C \mathbf{v} \cdot d\mathbf{x}
= \int_a^b \mathbf{v}(\mathbf{x}(t)) \cdot \frac{d\mathbf{x}}{dt}\; dt
= \int_a^b \left[ v_1(x(t), y(t), z(t))\, \frac{dx}{dt} + v_2(x(t), y(t), z(t))\, \frac{dy}{dt} + v_3(x(t), y(t), z(t))\, \frac{dz}{dt} \right] dt.
\tag{B.22}
$$

Line integrals in three dimensions enjoy all of the properties of their two-dimensional siblings: reversing the direction of parametrization along the curve changes the sign; also, the integral can be decomposed into sums over component curves:
$$
\int_{-C} \mathbf{v} \cdot d\mathbf{x} = -\int_C \mathbf{v} \cdot d\mathbf{x},
\qquad
\int_C \mathbf{v} \cdot d\mathbf{x} = \int_{C_1} \mathbf{v} \cdot d\mathbf{x} + \int_{C_2} \mathbf{v} \cdot d\mathbf{x},
\quad C = C_1 \cup C_2.
\tag{B.23}
$$
If $\mathbf{f}(\mathbf{x})$ represents a force field, e.g., gravity, electromagnetic force, etc., then its line integral $\int_C \mathbf{f} \cdot d\mathbf{x}$ represents the work done by moving along the curve. As in two dimensions, work is independent of the parametrization of the curve, i.e., of the particle's speed of traversal.
Example B.6. Our goal is to move a mass through the force field $\mathbf{f} = (y, -x, 1)^T$, starting from the initial point $(1, 0, 0)^T$ and moving vertically to the final point $(1, 0, 2\pi)^T$. Question: does it require more work to move in a straight line $\mathbf{x}(t) = (1, 0, t)^T$ or along the spiral helix $\mathbf{x}(t) = (\cos t, \sin t, t)^T$, where, in both cases, $0 \leq t \leq 2\pi$? The work line integral has the form
$$
\int_C \mathbf{f} \cdot d\mathbf{x}
= \int_C y\, dx - x\, dy + dz
= \int_0^{2\pi} \left( y\, \frac{dx}{dt} - x\, \frac{dy}{dt} + \frac{dz}{dt} \right) dt.
$$
Along the straight line, the amount of work is
$$
\int_C \mathbf{f} \cdot d\mathbf{x} = \int_0^{2\pi} dt = 2\pi.
$$
As for the spiral helix,
$$
\int_C \mathbf{f} \cdot d\mathbf{x} = \int_0^{2\pi} \left( -\sin^2 t - \cos^2 t + 1 \right) dt = 0.
$$
Thus, although we travel a more roundabout route, it takes no work to move along the helix!
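Both work integrals can be checked numerically via (B.22). The sketch below is our own illustration (it assumes SciPy and NumPy are available; the helper name work is ours, not the text's).

    import numpy as np
    from scipy.integrate import quad

    f = lambda x, y, z: np.array([y, -x, 1.0])      # force field of Example B.6

    def work(path, dpath, a=0.0, b=2*np.pi):
        # line integral of f along the parametrized path x(t), a <= t <= b
        integrand = lambda t: np.dot(f(*path(t)), dpath(t))
        return quad(integrand, a, b)[0]

    line   = lambda t: (1.0, 0.0, t)                      # straight vertical segment
    dline  = lambda t: np.array([0.0, 0.0, 1.0])
    helix  = lambda t: (np.cos(t), np.sin(t), t)          # spiral helix
    dhelix = lambda t: np.array([-np.sin(t), np.cos(t), 1.0])

    print(work(line, dline))     # 2*pi, about 6.2832
    print(work(helix, dhelix))   # 0: f is orthogonal to the helix's tangent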

The reason for the second result is that the force vector field $\mathbf{f}$ is everywhere orthogonal to the tangent to the curve: $\mathbf{f} \cdot \mathbf{t} = 0$, and so there is no tangential force exerted upon the motion. In such cases, the work line integral
$$
\int_C \mathbf{f} \cdot d\mathbf{x} = \int_C \mathbf{f} \cdot \mathbf{t}\; ds = 0
$$
automatically vanishes. In other words, it takes no work whatsoever to move in any direction which is orthogonal to the given force vector.

B.4. Surfaces.
Curves are one-dimensional, and so can be traced out by a single parameter. Surfaces are two-dimensional, and hence require two distinct parameters. Thus, a surface $S \subset \mathbb{R}^3$ is parametrized by a vector-valued function
$$
\mathbf{x}(p, q) = ( x(p, q),\, y(p, q),\, z(p, q) )^T
\tag{B.24}
$$
that depends on two variables. As the parameters $(p, q)$ range over a prescribed plane domain $\Omega \subset \mathbb{R}^2$, the locus of points $\mathbf{x}(p, q)$ traces out the surface in space. See Figure surf for an illustration. The parameters are often thought of as defining a system of local coordinates on the curved surface.
We shall always assume that the surface is simple, meaning that it does not intersect itself, so $\mathbf{x}(p, q) = \mathbf{x}(\widetilde{p}, \widetilde{q})$ if and only if $p = \widetilde{p}$ and $q = \widetilde{q}$. In practice, this condition can be quite hard to check! The boundary
$$
\partial S = \{\, \mathbf{x}(p, q) \mid (p, q) \in \partial\Omega \,\}
\tag{B.25}
$$
of a simple surface consists of one or more simple curves, as in Figure surf. If the underlying parameter domain $\Omega$ is bounded and simply connected, then $\partial\Omega$ is a simple closed plane curve, and so $\partial S$ is also a simple closed curve.
Example B.7. The simplest instance of a surface is the graph of a function. The parameters are the $x, y$ coordinates, and the surface coincides with the portion of the graph of the function $z = u(x, y)$ that lies over a fixed domain $(x, y) \in \Omega \subset \mathbb{R}^2$, as illustrated in Figure gsurf. Thus, a graphical surface has the parametric form
$$
\mathbf{x}(p, q) = ( p,\, q,\, u(p, q) )^T, \qquad (p, q) \in \Omega.
$$
The parametrization identifies $x = p$ and $y = q$, while $z = u(p, q) = u(x, y)$ represents the height of the surface above the point $(x, y) \in \Omega$.
For example, the upper hemisphere $S_r^+$ of radius $r$ centered at the origin can be parametrized as a graph
$$
z = \sqrt{r^2 - x^2 - y^2}\,, \qquad x^2 + y^2 < r^2,
\tag{B.26}
$$
sitting over the disk $D_r = \{ x^2 + y^2 < r^2 \}$ of radius $r$. The boundary of the hemisphere is the image of the circle $C_r = \partial D_r = \{ x^2 + y^2 = r^2 \}$ of radius $r$, and is itself a circle of radius $r$ sitting in the $x, y$ plane: $\partial S_r^+ = \{ x^2 + y^2 = r^2,\; z = 0 \}$.

Remark: One can interpret the Dirichlet problem (14.1), (14.4) for the two-dimensional Laplace equation as the problem of finding a surface $S$ that is the graph of a harmonic function with a prescribed boundary $\partial S = \{\, z = h(x, y) \text{ for } (x, y) \in \partial\Omega \,\}$.
Example B.8. A sphere $S_r$ of radius $r$ can be explicitly parametrized by two angular variables $\theta, \phi$ in the form
$$
\mathbf{x}(\theta, \phi) = ( r \sin\phi \cos\theta,\; r \sin\phi \sin\theta,\; r \cos\phi ),
\qquad 0 \leq \theta < 2\pi, \quad 0 \leq \phi \leq \pi.
\tag{B.27}
$$
The reader can easily check that $\| \mathbf{x} \|^2 = r^2$, as it should be. As illustrated in Figure sangle, $\theta$ measures the meridial angle or longitude, while $\phi$ measures the azimuthal angle or latitude. Thus, the upper hemisphere $S_r^+$ is obtained by restricting the azimuthal parameter to the range $0 \leq \phi \leq \frac12 \pi$. Each parameter value $\theta, \phi$ corresponds to a unique point on the sphere, except when $\phi = 0$ or $\pi$. All points $(\theta, 0)$ are mapped to the north pole $(0, 0, r)$, while all points $(\theta, \pi)$ are mapped to the south pole $(0, 0, -r)$. Away from the poles, the spherical angles provide bona fide coordinates on the sphere. Fortunately, the polar singularities do not interfere with the overall smoothness of the sphere. Nevertheless, one must always be careful at or near these two distinguished points.
The curves $\{ \phi = c \}$, where the azimuthal angle takes a prescribed constant value, are the circular parallels of constant latitude, except for the north and south poles, which are merely points. The equator is at $\phi = \frac12 \pi$, while the tropics of Cancer and Capricorn are $23\tfrac12{}^\circ \approx 0.41$ radians above and below the equator. The curves $\{ \theta = c \}$, where the meridial angle is constant, are the semi-circular meridians of constant longitude stretching from north to south pole. Note that $\theta = 0$ and $\theta = 2\pi$ describe the same meridian. In terrestrial navigation, latitude is the angle, in degrees, measured from the equator, while longitude is the angle measured from the Greenwich meridian.
Example B.9. A torus is a surface in the form of an inner tube. One convenient parametrization of a particular toroidal surface $T$ is
$$
\mathbf{x}(\theta, \psi) = ( (2 + \cos\psi) \cos\theta,\; (2 + \cos\psi) \sin\theta,\; \sin\psi )
\qquad \text{for} \qquad 0 \leq \theta, \psi \leq 2\pi.
\tag{B.28}
$$
Note that the parametrization is $2\pi$ periodic in both $\theta$ and $\psi$. If we introduce cylindrical coordinates
$$
x = r \cos\theta, \qquad y = r \sin\theta, \qquad z,
$$
then the torus is parametrized by
$$
r = 2 + \cos\psi, \qquad z = \sin\psi.
$$
Therefore, the relevant values of $(r, z)$ all lie on the circle
$$
(r - 2)^2 + z^2 = 1
\tag{B.29}
$$
of radius 1 centered at $(2, 0)$. As the polar angle $\theta$ increases from $0$ to $2\pi$, the circle rotates around the $z$ axis, and thereby sweeps out the torus.

Remark: The sphere and the torus are examples of closed surfaces. The requirements for a surface to be closed are that it be simple and bounded, and, moreover, have no boundary. In general, a subset $S \subset \mathbb{R}^3$ is bounded provided it does not stretch off infinitely far away. More precisely, boundedness is equivalent to the existence of a fixed number $R > 0$ that bounds the norm, $\| \mathbf{x} \| < R$, of all points $\mathbf{x} \in S$.
Tangents to Surfaces
Consider a surface $S$ parametrized by $\mathbf{x}(p, q)$ where $(p, q) \in \Omega$. Each parametrized curve $(p(t), q(t))$ in the parameter domain $\Omega$ will be mapped to a parametrized curve $C \subset S$ contained in the surface. The curve $C$ is parametrized by the composite map
$$
\mathbf{x}(t) = \mathbf{x}(p(t), q(t)) = ( x(p(t), q(t)),\, y(p(t), q(t)),\, z(p(t), q(t)) )^T.
$$
The tangent vector
$$
\frac{d\mathbf{x}}{dt} = \frac{\partial\mathbf{x}}{\partial p}\, \frac{dp}{dt} + \frac{\partial\mathbf{x}}{\partial q}\, \frac{dq}{dt}
\tag{B.30}
$$

to such a curve will be tangent to the surface. The set of all possible tangent vectors to curves passing through a given point in the surface traces out the tangent plane to the surface at that point, as in Figure tp. Note that the tangent vector (B.30) is a linear combination of the two basis tangent vectors
$$
\mathbf{x}_p = \frac{\partial\mathbf{x}}{\partial p} = \left( \frac{\partial x}{\partial p},\, \frac{\partial y}{\partial p},\, \frac{\partial z}{\partial p} \right)^{\!T},
\qquad
\mathbf{x}_q = \frac{\partial\mathbf{x}}{\partial q} = \left( \frac{\partial x}{\partial q},\, \frac{\partial y}{\partial q},\, \frac{\partial z}{\partial q} \right)^{\!T},
\tag{B.31}
$$
which therefore span the tangent plane to the surface at the point $\mathbf{x}(p, q) \in S$. The first basis vector is tangent to the curves where $q$ is held constant, while the second is tangent to the curves where $p$ is held constant.
Example B.10. Consider the torus $T$ parametrized as in (B.28). The basis tangent vectors are
$$
\frac{\partial\mathbf{x}}{\partial\theta} = \begin{pmatrix} -\,(2 + \cos\psi) \sin\theta \\ (2 + \cos\psi) \cos\theta \\ 0 \end{pmatrix},
\qquad
\frac{\partial\mathbf{x}}{\partial\psi} = \begin{pmatrix} -\,\sin\psi \cos\theta \\ -\,\sin\psi \sin\theta \\ \cos\psi \end{pmatrix}.
\tag{B.32}
$$
They serve to span the tangent plane to the torus at the point $\mathbf{x}(\theta, \psi)$. For example, at the point $\mathbf{x}(0, 0) = (3, 0, 0)^T$, corresponding to the particular parameter values $\theta = \psi = 0$, the basis tangent vectors are
$$
\mathbf{x}_\theta(0, 0) = (0, 3, 0)^T = 3\,\mathbf{e}_2,
\qquad
\mathbf{x}_\psi(0, 0) = (0, 0, 1)^T = \mathbf{e}_3,
$$
and so the tangent plane at this particular point is the $(y, z)$-plane spanned by the standard basis vectors $\mathbf{e}_2, \mathbf{e}_3$.
The tangent to any curve contained within the torus at the given point will be a linear combination of these two vectors. For instance, the toroidal knot (B.16) corresponds to the straight line
$$
\theta(t) = 2\,t, \qquad \psi(t) = 3\,t, \qquad 0 \leq t \leq 2\pi,
$$
in the parameter space. Its tangent vector
$$
\frac{d\mathbf{x}}{dt} = \begin{pmatrix}
-\,(4 + 2 \cos 3t) \sin 2t - 3 \sin 3t \cos 2t \\
(4 + 2 \cos 3t) \cos 2t - 3 \sin 3t \sin 2t \\
3 \cos 3t
\end{pmatrix}
$$
lies in the tangent plane to the torus at each point. In particular, at $t = 0$, the knot passes through the point $\mathbf{x}(0, 0) = (3, 0, 0)^T$, and has tangent vector
$$
\frac{d\mathbf{x}}{dt} = \begin{pmatrix} 0 \\ 6 \\ 3 \end{pmatrix}
= 2\,\mathbf{x}_\theta(0, 0) + 3\,\mathbf{x}_\psi(0, 0),
\qquad \text{since} \qquad
\frac{d\theta}{dt} = 2, \quad \frac{d\psi}{dt} = 3.
$$
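Such tangent-vector computations are easily verified symbolically. The following SymPy sketch (our own illustration, not from the text) differentiates the torus parametrization (B.28) and evaluates the basis tangent vectors at $\theta = \psi = 0$.

    import sympy as sp

    theta, psi = sp.symbols('theta psi')
    x = sp.Matrix([(2 + sp.cos(psi))*sp.cos(theta),
                   (2 + sp.cos(psi))*sp.sin(theta),
                   sp.sin(psi)])                      # torus parametrization (B.28)

    x_theta, x_psi = x.diff(theta), x.diff(psi)       # basis tangent vectors (B.32)
    print(x_theta.subs({theta: 0, psi: 0}).T)         # Matrix([[0, 3, 0]]) = 3 e2
    print(x_psi.subs({theta: 0, psi: 0}).T)           # Matrix([[0, 0, 1]]) = e3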

A point $\mathbf{x}(p, q) \in S$ on the surface is said to be nonsingular provided the basis tangent vectors $\mathbf{x}_p(p, q), \mathbf{x}_q(p, q)$ are linearly independent. Thus, the point is nonsingular if and only if the tangent vectors span a full two-dimensional subspace of $\mathbb{R}^3$, namely the tangent plane to the surface at the point. Nonsingularity ensures the smoothness of the surface at each point, which is a consequence of the general Implicit Function Theorem, [106]. Singular points, where the tangent vectors are linearly dependent, can take the form of corners, cusps and folds in the surface. From now on, we shall always assume that our surface is nonsingular, meaning that every point is a nonsingular point.
Linear independence of the tangent vectors is equivalent to the requirement that their cross product is a nonzero vector:
$$
\mathbf{N} = \mathbf{x}_p \wedge \mathbf{x}_q
= \left( \frac{\partial(y, z)}{\partial(p, q)},\;\; \frac{\partial(z, x)}{\partial(p, q)},\;\; \frac{\partial(x, y)}{\partial(p, q)} \right)^{\!T} \neq \mathbf{0}.
\tag{B.33}
$$
In this formula, we have adopted the standard notation
$$
\frac{\partial(x, y)}{\partial(p, q)} = \det \begin{pmatrix} x_p & x_q \\ y_p & y_q \end{pmatrix}
= \frac{\partial x}{\partial p}\, \frac{\partial y}{\partial q} - \frac{\partial x}{\partial q}\, \frac{\partial y}{\partial p}
\tag{B.34}
$$
for the Jacobian determinant of the functions $x, y$ with respect to the variables $p, q$, which we already encountered in the change of variables formula (A.52) for double integrals.
The cross-product vector $\mathbf{N}$ in (B.33) is orthogonal to both tangent vectors, and hence orthogonal to the entire tangent plane. Therefore, $\mathbf{N}$ defines a normal vector to the surface at the given (nonsingular) point.
Example B.11. Consider a surface $S$ parametrized as the graph of a function $z = u(x, y)$, and so, as in Example B.7,
$$
\mathbf{x}(x, y) = ( x,\, y,\, u(x, y) )^T, \qquad (x, y) \in \Omega.
$$
The tangent vectors
$$
\frac{\partial\mathbf{x}}{\partial x} = \left( 1,\, 0,\, \frac{\partial u}{\partial x} \right)^{\!T},
\qquad
\frac{\partial\mathbf{x}}{\partial y} = \left( 0,\, 1,\, \frac{\partial u}{\partial y} \right)^{\!T},
$$
span the tangent plane sitting at the point $(x, y, u(x, y))$ on $S$. The normal vector is
$$
\mathbf{N} = \frac{\partial\mathbf{x}}{\partial x} \wedge \frac{\partial\mathbf{x}}{\partial y}
= \left( -\,\frac{\partial u}{\partial x},\; -\,\frac{\partial u}{\partial y},\; 1 \right)^{\!T},
$$
and points upwards, as in Figure graphN. Note that every point on the graph is nonsingular.
The unit normal to the surface at the point is a unit vector orthogonal to the tangent plane, and hence given by
$$
\mathbf{n} = \frac{\mathbf{N}}{\| \mathbf{N} \|} = \frac{\mathbf{x}_p \wedge \mathbf{x}_q}{\| \mathbf{x}_p \wedge \mathbf{x}_q \|}.
\tag{B.35}
$$
In general, the direction of the normal vector $\mathbf{N}$ depends upon the order of the two parameters $p, q$. Computing the cross product in the reverse order, $\mathbf{x}_q \wedge \mathbf{x}_p = -\,\mathbf{N}$, reverses the sign of the normal vector, and hence switches its direction. Thus, there are two possible unit normals to the surface at each point, namely $\mathbf{n}$ and $-\mathbf{n}$. For a closed surface, one normal points outwards and one points inwards.
When possible, a consistent (meaning continuously varying) choice of a unit normal serves to define an orientation of the surface. All closed surfaces, and most other surfaces, can be oriented. The usual convention for closed surfaces is to choose the orientation defined by the outward normal. The simplest example of a non-orientable surface is the Möbius strip, obtained by gluing together the ends of a twisted strip of paper; see Exercise .
Example B.12. For the sphere of radius $r$ parametrized by the spherical angles as in (B.27), the tangent vectors are
$$
\frac{\partial\mathbf{x}}{\partial\phi} = \begin{pmatrix} r \cos\phi \cos\theta \\ r \cos\phi \sin\theta \\ -\,r \sin\phi \end{pmatrix},
\qquad
\frac{\partial\mathbf{x}}{\partial\theta} = \begin{pmatrix} -\,r \sin\phi \sin\theta \\ r \sin\phi \cos\theta \\ 0 \end{pmatrix}.
$$
These vectors are tangent to, respectively, the meridians of constant longitude and the parallels of constant latitude. The normal vector is
$$
\mathbf{N} = \frac{\partial\mathbf{x}}{\partial\phi} \wedge \frac{\partial\mathbf{x}}{\partial\theta}
= \begin{pmatrix} r^2 \sin^2\phi \cos\theta \\ r^2 \sin^2\phi \sin\theta \\ r^2 \cos\phi \sin\phi \end{pmatrix}
= r \sin\phi\; \mathbf{x}.
\tag{B.36}
$$
Thus $\mathbf{N}$ is a non-zero multiple of the radial vector $\mathbf{x}$, except at the north or south poles, when $\phi = 0$ or $\pi$. This reconfirms our earlier observation that the poles are problematic points for the spherical angle parametrization. The unit normal
$$
\mathbf{n} = \frac{\mathbf{N}}{\| \mathbf{N} \|} = \frac{\mathbf{x}}{r}
$$
determined by this ordering of the spherical angles is the outward pointing normal. Reversing the order of the two angles would lead instead to the inward normal $-\,\mathbf{n} = -\,\mathbf{x}/r$.

Remark: As we already saw in the example of the hemisphere, a given surface can be parametrized in many different ways. In general, to change parameters,
$$
p = g(\widetilde{p}, \widetilde{q}), \qquad q = h(\widetilde{p}, \widetilde{q}),
$$
requires a smooth, invertible map between the two parameter domains $\widetilde{\Omega} \to \Omega$. Many interesting surfaces, particularly closed surfaces, cannot be parametrized in a single consistent manner that satisfies the smoothness constraint (B.33) on the entire surface. In such cases, one must assemble the surface out of pieces, each parametrized in the proper manner. The key problem in cartography is to find convenient parametrizations of the globe that do not significantly distort the geographical features of the planet.
A surface is piecewise smooth if it can be constructed by gluing together a finite number of smooth parts, joined along piecewise smooth curves. For example, a cube is a piecewise smooth surface, consisting of squares joined along straight line segments. We shall rely on the reader's intuition to formalize these ideas, leaving a rigorous development to a more comprehensive treatment of surface geometry, e.g., [40].

B.5. Surface Integrals.


As with spatial line integrals, there are two important types of surface integral. The first is the integration of a scalar field with respect to surface area. A typical application is to compute the area of a curved surface, or the mass and center of mass of a curved shell of possibly variable density. The second type is the surface integral that computes the flux associated with a vector field through an oriented surface. Applications appear in fluid mechanics, electromagnetism, thermodynamics, gravitation, and many other fields.
Surface Area
According to (B.10), the length of the cross product of two vectors measures the area of the parallelogram they span. This observation underlies the proof that the length of the normal vector to a surface (B.35), namely $\| \mathbf{N} \| = \| \mathbf{x}_p \wedge \mathbf{x}_q \|$, is a measure of the infinitesimal element of surface area, denoted
$$
dS = \| \mathbf{N} \|\; dp\, dq = \| \mathbf{x}_p \wedge \mathbf{x}_q \|\; dp\, dq.
\tag{B.37}
$$
The total area of the surface is found by summing up these infinitesimal contributions, and is therefore given by the double integral
$$
\text{area}\; S = \iint_S dS = \iint_\Omega \| \mathbf{x}_p \wedge \mathbf{x}_q \|\; dp\, dq
= \iint_\Omega \sqrt{ \left( \frac{\partial(y, z)}{\partial(p, q)} \right)^{\!2} + \left( \frac{\partial(z, x)}{\partial(p, q)} \right)^{\!2} + \left( \frac{\partial(x, y)}{\partial(p, q)} \right)^{\!2} }\;\; dp\, dq.
\tag{B.38}
$$

The surface's area does not depend upon the parametrization used to compute the integral. In particular, if the surface is parametrized by $x, y$ as the graph $z = u(x, y)$ of a function over a domain $(x, y) \in \Omega$, then the surface area integral reduces to the familiar form
$$
\text{area}\; S = \iint_S dS = \iint_\Omega \sqrt{ 1 + \left( \frac{\partial u}{\partial x} \right)^{\!2} + \left( \frac{\partial u}{\partial y} \right)^{\!2} }\;\; dx\, dy.
\tag{B.39}
$$
A detailed justification of these formulae can be found in [9, 48].


Example B.13. The well-known formula for the surface area of a sphere is a simple consequence of the integral formula (B.38). Using the parametrization by spherical angles (B.27) and the formula (B.36) for the normal, we find
$$
\text{area}\; S_r = \iint_{S_r} dS = \int_0^{2\pi}\!\! \int_0^{\pi} r^2 \sin\phi\; d\phi\, d\theta = 4\pi r^2.
\tag{B.40}
$$
Fortunately, the problematic poles do not cause any difficulty in the computation, since they contribute nothing to the surface area integral.
Alternatively, we can compute the area of one hemisphere $S_r^+$ by realizing it as a graph
$$
z = \sqrt{r^2 - x^2 - y^2} \qquad \text{for} \qquad x^2 + y^2 \leq r^2,
$$
over the disk of radius $r$, and so, by (B.39),
$$
\text{area}\; S_r^+ = \iint_{D_r} \sqrt{ 1 + \frac{x^2}{r^2 - x^2 - y^2} + \frac{y^2}{r^2 - x^2 - y^2} }\;\; dx\, dy
= \iint_{D_r} \frac{r}{\sqrt{r^2 - x^2 - y^2}}\; dx\, dy
= \int_0^{r}\!\! \int_0^{2\pi} \frac{r}{\sqrt{r^2 - \rho^2}}\; \rho\; d\theta\, d\rho = 2\pi r^2,
$$
where we used polar coordinates $x = \rho \cos\theta$, $y = \rho \sin\theta$ to evaluate the final integral. The area of the entire sphere is twice the area of the hemisphere.
Example B.14. Similarly, to compute the surface area of the torus $T$ parametrized in (B.28), we use the tangent vectors in (B.32) to compute the normal to the torus:
$$
\mathbf{N} = \mathbf{x}_\theta \wedge \mathbf{x}_\psi
= \begin{pmatrix} (2 + \cos\psi) \cos\psi \cos\theta \\ (2 + \cos\psi) \cos\psi \sin\theta \\ (2 + \cos\psi) \sin\psi \end{pmatrix},
\qquad \text{with} \qquad
\| \mathbf{x}_\theta \wedge \mathbf{x}_\psi \| = 2 + \cos\psi.
$$
Therefore,
$$
\text{area}\; T = \int_0^{2\pi}\!\! \int_0^{2\pi} (2 + \cos\psi)\; d\psi\, d\theta = 8\pi^2.
$$
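The value $8\pi^2 \approx 78.96$ is easy to cross-check numerically (our own sketch, assuming SciPy and NumPy are available): approximate the tangent vectors by central differences and integrate the resulting area element (B.37) over the parameter square.

    import numpy as np
    from scipy.integrate import dblquad

    def torus(theta, psi):
        return np.array([(2 + np.cos(psi))*np.cos(theta),
                         (2 + np.cos(psi))*np.sin(theta),
                         np.sin(psi)])

    def dS(psi, theta, h=1e-6):
        # surface area element || x_theta ^ x_psi || via central differences
        x_t = (torus(theta + h, psi) - torus(theta - h, psi)) / (2*h)
        x_p = (torus(theta, psi + h) - torus(theta, psi - h)) / (2*h)
        return np.linalg.norm(np.cross(x_t, x_p))

    area, _ = dblquad(dS, 0, 2*np.pi, lambda t: 0.0, lambda t: 2*np.pi)
    print(area, 8*np.pi**2)    # both approximately 78.957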

If $S \subset \mathbb{R}^3$ is a surface with finite area, the mean or average of a scalar function $f(x, y, z)$ over $S$ is given by
$$
M_S[\, f \,] = \frac{1}{\text{area}\; S} \iint_S f\; dS.
\tag{B.41}
$$
For example, the mean of a function over a sphere $S_r = \{ \| \mathbf{x} \| = r \}$ of radius $r$ is explicitly given by
$$
M_{S_r}[\, f \,] = \frac{1}{4\pi r^2} \iint_{\| \mathbf{x} \| = r} f(\mathbf{x})\; dS
= \frac{1}{4\pi} \int_0^{2\pi}\!\! \int_0^{\pi} F(r, \phi, \theta)\, \sin\phi\; d\phi\, d\theta,
\tag{B.42}
$$
where $F(r, \phi, \theta)$ is the spherical coordinate expression for the scalar function $f$. As usual, the mean lies between the maximum and minimum values of the function on the surface:
$$
\min_S f \;\leq\; M_S[\, f \,] \;\leq\; \max_S f.
$$
In particular, the center of mass $\mathbf{C}$ of a surface (assuming it has constant density) is equal to the mean of the coordinate functions $\mathbf{x} = (x, y, z)^T$, so
$$
\mathbf{C} = \left( M_S[\, x \,],\; M_S[\, y \,],\; M_S[\, z \,] \right)^T
= \frac{1}{\text{area}\; S} \left( \iint_S x\; dS,\;\; \iint_S y\; dS,\;\; \iint_S z\; dS \right)^{\!T}.
\tag{B.43}
$$
Thus, since $\iint_{S_r^+} z\; dS = \int_0^{2\pi}\!\!\int_0^{\pi/2} (r \cos\phi)\, r^2 \sin\phi\; d\phi\, d\theta = \pi r^3$, while the $x$ and $y$ means vanish by symmetry, the center of mass of a hemispherical shell $S_r^+$ of radius $r$ is $\mathbf{C} = \left( 0,\, 0,\, \tfrac12 r \right)^T$, lying halfway up its vertical axis.


More generally, the integral of a scalar field $u(x, y, z)$ over the surface is given by
$$
\iint_S u\; dS = \iint_\Omega u(x(p, q), y(p, q), z(p, q))\, \| \mathbf{x}_p \wedge \mathbf{x}_q \|\; dp\, dq.
\tag{B.44}
$$
If $S$ represents a thin curved shell, and $u = \rho(\mathbf{x})$ the density of the material at position $\mathbf{x} \in S$, then the surface integral (B.44) represents the total mass of the shell. For example, the integral of $u(x, y, z)$ over a hemisphere $S_r^+$ of radius $r$ can be evaluated by either of the formulae
$$
\iint_{S_r^+} u\; dS = \int_0^{2\pi}\!\! \int_0^{\pi/2} u(r \cos\theta \sin\phi,\; r \sin\theta \sin\phi,\; r \cos\phi)\; r^2 \sin\phi\; d\phi\, d\theta
\tag{B.45}
$$
$$
= \iint_{x^2 + y^2 \leq r^2} u\!\left( x,\, y,\, \sqrt{r^2 - x^2 - y^2} \right) \frac{r}{\sqrt{r^2 - x^2 - y^2}}\; dx\, dy,
$$
depending upon whether one prefers spherical or graphical coordinates.


Flux Integrals

Now assume that $S$ is an oriented surface with chosen unit normal $\mathbf{n}$. If $\mathbf{v} = (u, v, w)^T$ is a vector field, then the surface integral
$$
\iint_S \mathbf{v} \cdot \mathbf{n}\; dS
= \iint_\Omega \mathbf{v} \cdot \mathbf{x}_p \wedge \mathbf{x}_q\; dp\, dq
= \iint_\Omega \det \begin{pmatrix} u & x_p & x_q \\ v & y_p & y_q \\ w & z_p & z_q \end{pmatrix} dp\, dq
\tag{B.46}
$$
of the normal component of $\mathbf{v}$ over the entire surface measures its flux through the surface.
An alternative common notation for the flux integral is
$$
\iint_S \mathbf{v} \cdot \mathbf{n}\; dS
= \iint_S u\; dy\, dz + v\; dz\, dx + w\; dx\, dy
= \iint_\Omega \left[ u(x, y, z)\, \frac{\partial(y, z)}{\partial(p, q)} + v(x, y, z)\, \frac{\partial(z, x)}{\partial(p, q)} + w(x, y, z)\, \frac{\partial(x, y)}{\partial(p, q)} \right] dp\, dq.
\tag{B.47}
$$

Note how the Jacobian determinant notation (B.34) seamlessly interacts with the integration. In particular, if the surface is the graph of a function $z = h(x, y)$, then the surface integral reduces to the particularly simple form
$$
\iint_S \mathbf{v} \cdot \mathbf{n}\; dS
= \iint_\Omega \left[ -\,u(x, y, z)\, \frac{\partial z}{\partial x} - v(x, y, z)\, \frac{\partial z}{\partial y} + w(x, y, z) \right] dx\, dy.
\tag{B.48}
$$

The flux surface integral relies upon a consistent choice of an orientation, or unit normal, on the surface. Thus, flux only makes sense through an oriented surface; it doesn't make sense to speak of flux through a Möbius band. If we switch normals, using, say, the inward instead of the outward normal, then the surface integral changes sign, just like a line integral if we reverse the orientation of a curve. Similarly, if we decompose a surface into the union of two or more parts, with only their boundaries in common, then the surface integral similarly decomposes into a sum of surface integrals. Thus,
$$
\iint_{-S} \mathbf{v} \cdot \mathbf{n}\; dS = -\iint_S \mathbf{v} \cdot \mathbf{n}\; dS,
\qquad
\iint_S \mathbf{v} \cdot \mathbf{n}\; dS = \iint_{S_1} \mathbf{v} \cdot \mathbf{n}\; dS + \iint_{S_2} \mathbf{v} \cdot \mathbf{n}\; dS,
\quad S = S_1 \cup S_2.
\tag{B.49}
$$
In the first formula, $-S$ denotes the surface $S$ with the reverse orientation. In the second formula, $S_1$ and $S_2$ are only allowed to intersect along their boundaries; moreover, they must be oriented in the same manner as $S$, i.e., have the same unit normal direction.
Example B.15. Let $S$ denote the triangular surface given by that portion of the plane $x + y + z = 1$ that lies inside the positive orthant $\{\, x \geq 0,\; y \geq 0,\; z \geq 0 \,\}$, as in Figure tri3. The flux of the vector field $\mathbf{v} = (y,\; x\,z,\; 0)^T$ through $S$ equals the surface integral
$$
\iint_S y\; dy\, dz + x\,z\; dz\, dx,
$$
where we orient $S$ by choosing the upwards pointing normal. To compute it, we note that $S$ can be identified as the graph of the function $z = 1 - x - y$ lying over the triangle $T = \{\, 0 \leq x \leq 1,\; 0 \leq y \leq 1 - x \,\}$. Therefore, by (B.47),
$$
\iint_S y\; dy\, dz + x\,z\; dz\, dx
= \iint_T \left[ y\, \frac{\partial(y,\, 1 - x - y)}{\partial(x, y)} + x\,(1 - x - y)\, \frac{\partial(1 - x - y,\, x)}{\partial(x, y)} \right] dx\, dy
$$
$$
= \int_0^1 \!\! \int_0^{1 - x} \bigl[\, y + x\,(1 - x - y) \,\bigr]\; dy\, dx
= \int_0^1 \tfrac12 (1 + x)(1 - x)^2\; dx = \tfrac{5}{24}.
$$
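A quick numerical evaluation of the same double integral (our own sketch, assuming SciPy is available) confirms the value $5/24 \approx 0.2083$.

    from scipy.integrate import dblquad

    # flux of v = (y, x*z, 0) upward through the portion of x + y + z = 1
    # in the positive orthant, written as a double integral over the
    # triangle 0 <= x <= 1, 0 <= y <= 1 - x
    integrand = lambda y, x: y + x*(1 - x - y)

    flux, _ = dblquad(integrand, 0, 1, lambda x: 0, lambda x: 1 - x)
    print(flux)    # 0.20833... = 5/24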

If $\mathbf{v}$ represents the velocity vector field for a steady state fluid flow, then its flux integral (B.46) tells us the total volume of fluid passing through $S$ per unit time. Indeed, at each point on $S$, the volume of fluid that flows across a small piece of the surface in unit time will fill a thin cylinder whose base is the surface area element $dS$ and whose height $\mathbf{v} \cdot \mathbf{n}$ is the normal component of the fluid velocity $\mathbf{v}$, as pictured in Figure fluxs. Summing (integrating) all these flux cylinder volumes over the surface results in the flux integral. The choice of orientation or unit normal specifies the convention for measuring the direction of positive flux through the surface. If $S$ is a closed surface, and we choose $\mathbf{n}$ to be the unit outward normal, then the flux integral (B.46) represents the net amount of fluid flowing out of the solid region bounded by $S$ per unit time.
T

Example B.16. The vector field v = ( 0, 0, 1 ) represents a fluid moving with constant velocity in the vertical direction. Let us compute the fluid flux through a hemisphere

n
o
p

Sr+ = z = r2 x2 y 2 x2 + y 2 1 ,

sitting over the disk Dr of radius r in the x, y plane. The flux integral over Sr+ is computed
using (B.48), so
ZZ
ZZ
ZZ
dx dy = r 2 .
dx dy =
v n dS =
Sr+

Sr+

Dr

The resulting double integral is just the area of the disk. Indeed, in this case, the value of
the flux integral is the same for all surfaces z = h(x, y) sitting over the disk D r .
This example provides a particular case of a surface-independent flux integral, defined in analogy with the path-independent line integrals that we encountered earlier. In general, a flux integral is called surface-independent if
$$
\iint_{S_1} \mathbf{v} \cdot \mathbf{n}\; dS = \iint_{S_2} \mathbf{v} \cdot \mathbf{n}\; dS
\tag{B.50}
$$
whenever the surfaces $S_1$ and $S_2$ have a common boundary $\partial S_1 = \partial S_2$. In other words, the value of the integral depends only upon the boundary of the surface. The veracity of (B.50) requires that the surfaces be oriented in the same manner. For instance, if they do not cross, then the combined surface $S = S_1 \cup S_2$ is closed, and one uses the outward pointing normal on one surface and the inward pointing normal on the other. In more complex situations, one checks that the two surfaces induce the same orientation on their common boundary. (We defer a discussion of the boundary orientation until later.) Finally, applying (B.49) to the closed surface $S = S_1 \cup S_2$ and using the prescribed orientations, we deduce an alternative characterization of surface-independent vector fields.
Proposition B.17. A vector field leads to a surface-independent flux integral if and only if
$$
\iint_S \mathbf{v} \cdot \mathbf{n}\; dS = 0
\tag{B.51}
$$
for every closed surface $S$ contained in the domain of definition of $\mathbf{v}$.


A fluid is incompressible when its volume is unaltered by the flow. Therefore, in the
absence of sources or sinks, there cannot be any net inflow or outflow across a simple closed
surface bounding a region occupied by the fluid. Thus, the flux integral over a closed surface
3/7/03

962

c 2003

Peter J. Olver

must vanish:

ZZ

v n dS = 0. Proposition B.17 implies that the fluid velocity vector

field defines a surface-independent flux integral. Thus, the flux of an incompressible fluid
flow through any surface depends only on the (oriented) boundary curve of the surface!

B.6. Volume Integrals.


Volume or triple integrals take place over domains $\Omega \subset \mathbb{R}^3$ representing solid three-dimensional bodies. A simple example of such a domain is a ball
$$
B_r(\mathbf{a}) = \{\, \mathbf{x} \mid \| \mathbf{x} - \mathbf{a} \| < r \,\}
\tag{B.52}
$$
of radius $r > 0$ centered at a point $\mathbf{a} \in \mathbb{R}^3$. Other examples of domains include solid cubes, solid cylinders, solid tetrahedra, solid tori (doughnuts and bagels), solid cones, etc.
In general, a subset $\Omega \subset \mathbb{R}^3$ is open if, for every point $\mathbf{a} \in \Omega$, a small open ball $B_\varepsilon(\mathbf{a})$ centered at $\mathbf{a}$, of radius $\varepsilon = \varepsilon(\mathbf{a}) > 0$ which may depend upon $\mathbf{a}$, is also contained in $\Omega$. In particular, the ball (B.52) is open. The boundary $\partial\Omega$ of an open subset consists of all limit points which are not in the subset. Thus, the boundary of the open ball $B_r(\mathbf{a})$ is the sphere $S_r(\mathbf{a}) = \{ \| \mathbf{x} - \mathbf{a} \| = r \}$ of radius $r$ centered at the point $\mathbf{a}$. An open subset is called a domain if its boundary consists of one or more simple, piecewise smooth surfaces. We are allowing corners and edges in the bounding surfaces, so that an open cube will be a perfectly valid domain.
A subset $\Omega \subset \mathbb{R}^3$ is bounded provided it fits inside a sphere of some (possibly large) radius. For example, the solid ball $B_R = \{ \| \mathbf{x} \| < R \}$ is bounded, while its exterior $E_R = \{ \| \mathbf{x} \| > R \}$ is an unbounded domain. The sphere $S_R = \{ \| \mathbf{x} \| = R \}$ is the common boundary of the two domains: $S_R = \partial B_R = \partial E_R$. Indeed, any simple closed surface separates $\mathbb{R}^3$ into two domains that have a common boundary: its interior, which is bounded, and its unbounded exterior.
The boundary of a bounded domain consists of one or more closed surfaces. For instance, the solid annular domain
$$
A_{r,R} = \{\, 0 < r < \| \mathbf{x} \| < R \,\},
\tag{B.53}
$$
consisting of all points lying between two concentric spheres of respective radii $r$ and $R$, has boundary given by the two spheres: $\partial A_{r,R} = S_r \cup S_R$. On the other hand, setting $r = 0$ in (B.53) leads to a punctured ball of radius $R$ whose center point has been removed. A punctured ball is not a domain, since the center point is part of the boundary, but is not a bona fide surface.
If the domain $\Omega \subset \mathbb{R}^3$ represents a solid body, and the scalar field $\rho(x, y, z)$ represents its density at a point $(x, y, z) \in \Omega$, then the triple integral
$$
\iiint_\Omega \rho(x, y, z)\; dx\, dy\, dz
\tag{B.54}
$$
equals the total mass of the body. In particular, the volume of $\Omega$ is equal to
$$
\text{vol}\; \Omega = \iiint_\Omega dx\, dy\, dz.
\tag{B.55}
$$


Triple integrals can be directly evaluated when the domain has the particular form
$$
\Omega = \{\, \alpha(x, y) < z < \beta(x, y),\;\; \gamma(x) < y < \delta(x),\;\; a < x < b \,\},
\tag{B.56}
$$
where the $z$ coordinate lies between two graphical surfaces sitting over a common domain in the $(x, y)$-plane that is itself of the form of (A.47) used to evaluate double integrals; see Figure triple. In such cases, we can evaluate the triple integral by iterated integration, first with respect to $z$, then with respect to $y$ and, finally, with respect to $x$:
$$
\iiint_\Omega u(x, y, z)\; dx\, dy\, dz
= \int_a^b \left( \int_{\gamma(x)}^{\delta(x)} \left( \int_{\alpha(x, y)}^{\beta(x, y)} u(x, y, z)\; dz \right) dy \right) dx.
\tag{B.57}
$$
A similar result holds for other orderings of the coordinates.


Fubini's Theorem, [106, 105], assures us that the result of iterated integration does not depend upon the order in which the variables are integrated. Of course, the domain must be of the requisite type in order to write the volume integral as repeated single integrals. More general triple integrals can be evaluated by chopping the domain up into disjoint pieces that have the proper form.
Example B.18. The volume of a solid ball $B_R$ of radius $R$ can be computed as follows. We express the domain of integration $x^2 + y^2 + z^2 < R^2$ in the form
$$
-R < x < R, \qquad -\sqrt{R^2 - x^2} < y < \sqrt{R^2 - x^2}, \qquad -\sqrt{R^2 - x^2 - y^2} < z < \sqrt{R^2 - x^2 - y^2}.
$$
Therefore, in accordance with (B.57),
$$
\iiint_{B_R} dx\, dy\, dz
= \int_{-R}^{R} \int_{-\sqrt{R^2 - x^2}}^{\sqrt{R^2 - x^2}} \left( \int_{-\sqrt{R^2 - x^2 - y^2}}^{\sqrt{R^2 - x^2 - y^2}} dz \right) dy\, dx
= \int_{-R}^{R} \int_{-\sqrt{R^2 - x^2}}^{\sqrt{R^2 - x^2}} 2 \sqrt{R^2 - x^2 - y^2}\; dy\, dx
$$
$$
= \int_{-R}^{R} \left[\, y \sqrt{R^2 - x^2 - y^2} + (R^2 - x^2) \sin^{-1}\! \frac{y}{\sqrt{R^2 - x^2}} \,\right]_{y = -\sqrt{R^2 - x^2}}^{\,y = \sqrt{R^2 - x^2}} dx
= \int_{-R}^{R} \pi\, (R^2 - x^2)\; dx = \tfrac43 \pi R^3,
$$
recovering the standard formula, as it should.
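The same iterated integration is easily reproduced numerically. The sketch below is our own illustration (assuming SciPy and NumPy are available); the max guard simply protects the square roots against tiny negative round-off values.

    import numpy as np
    from scipy.integrate import tplquad

    R = 1.0
    volume, _ = tplquad(lambda z, y, x: 1.0,
                        -R, R,                                              # x range
                        lambda x: -np.sqrt(R**2 - x**2),                    # y range
                        lambda x:  np.sqrt(R**2 - x**2),
                        lambda x, y: -np.sqrt(max(R**2 - x**2 - y**2, 0.0)),  # z range
                        lambda x, y:  np.sqrt(max(R**2 - x**2 - y**2, 0.0)))
    print(volume, 4*np.pi*R**3/3)    # both approximately 4.18879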

Change of Variables
Sometimes, an inspired change of variables can be used to simplify a volume integral. If
$$
x = f(p, q, r), \qquad y = g(p, q, r), \qquad z = h(p, q, r)
\tag{B.58}
$$
is an invertible change of variables, meaning that each point $(x, y, z) \in \Omega$ corresponds to a unique point $(p, q, r) \in D$, then
$$
\iiint_\Omega u(x, y, z)\; dx\, dy\, dz
= \iiint_D U(p, q, r) \left| \frac{\partial(x, y, z)}{\partial(p, q, r)} \right| dp\, dq\, dr.
\tag{B.59}
$$

Here
$$
U(p, q, r) = u(x(p, q, r),\, y(p, q, r),\, z(p, q, r))
$$
is the expression for the integrand in the new coordinates, while $D$ is the domain consisting of all points $(p, q, r)$ that map to points $(x, y, z)$ in the original domain $\Omega$. Invertibility requires that each point in $D$ corresponds to a unique point in $\Omega$. The change in volume is governed by the absolute value of the three-dimensional Jacobian determinant
$$
\frac{\partial(x, y, z)}{\partial(p, q, r)} = \det \begin{pmatrix} x_p & x_q & x_r \\ y_p & y_q & y_r \\ z_p & z_q & z_r \end{pmatrix} = \mathbf{x}_p \cdot \mathbf{x}_q \wedge \mathbf{x}_r
\tag{B.60}
$$
for the change of variables. The identification of the vector triple product (B.60) with an (infinitesimal) volume element lies behind the justification of the change of variables formula; see [9, 48] for a detailed proof.
By far, the two most important cases are cylindrical and spherical coordinates. Cylindrical coordinates correspond to replacing the $x$ and $y$ coordinates by their polar counterparts, while retaining the vertical $z$ coordinate unchanged. Thus, the change of coordinates has the form
$$
x = r \cos\theta, \qquad y = r \sin\theta, \qquad z = z.
\tag{B.61}
$$
The Jacobian determinant for cylindrical coordinates is
$$
\frac{\partial(x, y, z)}{\partial(r, \theta, z)}
= \det \begin{pmatrix} x_r & x_\theta & x_z \\ y_r & y_\theta & y_z \\ z_r & z_\theta & z_z \end{pmatrix}
= \det \begin{pmatrix} \cos\theta & -\,r \sin\theta & 0 \\ \sin\theta & r \cos\theta & 0 \\ 0 & 0 & 1 \end{pmatrix} = r.
\tag{B.62}
$$
Therefore, the general change of variables formula (B.59) tells us the formula for a triple integral in cylindrical coordinates:
$$
\iiint_\Omega f(x, y, z)\; dx\, dy\, dz = \iiint_D f(r \cos\theta,\, r \sin\theta,\, z)\; r\; dr\, d\theta\, dz.
\tag{B.63}
$$
Example B.19. For example, consider an ice cream cone
$$
C_h = \{\, x^2 + y^2 < z^2,\; 0 < z < h \,\} = \{\, r < z,\; 0 < z < h \,\}
$$
of height $h$, plotted in Figure cone. To compute its volume, we express the domain in terms of the cylindrical coordinates, leading to
$$
\iiint_{C_h} dx\, dy\, dz = \int_0^h \!\! \int_0^{2\pi} \!\! \int_0^z r\; dr\, d\theta\, dz = \int_0^h \pi z^2\; dz = \tfrac13 \pi h^3.
$$
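Here is a minimal numerical check of the cone volume in cylindrical coordinates (our own sketch, assuming SciPy and NumPy are available, with the illustrative value h = 2).

    import numpy as np
    from scipy.integrate import tplquad

    h = 2.0
    # volume of the cone C_h: integrand r, with 0 < r < z, 0 < theta < 2*pi, 0 < z < h
    vol, _ = tplquad(lambda r, theta, z: r,
                     0, h,                                   # z range (outer)
                     lambda z: 0, lambda z: 2*np.pi,         # theta range
                     lambda z, theta: 0, lambda z, theta: z) # r range (inner)
    print(vol, np.pi*h**3/3)    # both approximately 8.3776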

Spherical coordinates are denoted by $r, \phi, \theta$, where
$$
x = r \sin\phi \cos\theta, \qquad y = r \sin\phi \sin\theta, \qquad z = r \cos\phi.
\tag{B.64}
$$
Here $r = \| \mathbf{x} \| = \sqrt{x^2 + y^2 + z^2}$ represents the radius, $0 \leq \phi \leq \pi$ is the azimuthal angle or latitude, while $0 \leq \theta < 2\pi$ is the meridial angle or longitude. The reader may recall that we already encountered these coordinates in our parametrization (B.27) of the sphere. It is important to distinguish between the spherical $r, \theta$ and the cylindrical $r, \theta$: even though the same symbols are used, they represent different quantities.
A short computation proves that the spherical coordinate Jacobian determinant is
$$
\frac{\partial(x, y, z)}{\partial(r, \phi, \theta)}
= \det \begin{pmatrix} x_r & x_\phi & x_\theta \\ y_r & y_\phi & y_\theta \\ z_r & z_\phi & z_\theta \end{pmatrix}
= \det \begin{pmatrix} \sin\phi \cos\theta & r \cos\phi \cos\theta & -\,r \sin\phi \sin\theta \\ \sin\phi \sin\theta & r \cos\phi \sin\theta & r \sin\phi \cos\theta \\ \cos\phi & -\,r \sin\phi & 0 \end{pmatrix}
= r^2 \sin\phi.
\tag{B.65}
$$
Therefore, a triple integral is evaluated in spherical coordinates according to the formula
$$
\iiint_\Omega f(x, y, z)\; dx\, dy\, dz = \iiint_D F(r, \phi, \theta)\; r^2 \sin\phi\; dr\, d\phi\, d\theta,
\tag{B.66}
$$
where we rewrite the integrand
$$
F(r, \phi, \theta) = f(r \sin\phi \cos\theta,\; r \sin\phi \sin\theta,\; r \cos\phi)
\tag{B.67}
$$
as a function of the spherical coordinates.


Example B.20. The integration required in Example B.18 to compute the volume of a ball $B_R$ of radius $R$ can be considerably simplified by switching over to spherical coordinates. The ball is given by $B_R = \{\, 0 \leq r < R,\; 0 \leq \phi \leq \pi,\; 0 \leq \theta < 2\pi \,\}$. Thus, using (B.66), we compute
$$
\iiint_{B_R} dx\, dy\, dz
= \int_0^R \!\! \int_0^{\pi} \!\! \int_0^{2\pi} r^2 \sin\phi\; d\theta\, d\phi\, dr
= \int_0^R 4\pi r^2\; dr = \tfrac43 \pi R^3.
\tag{B.68}
$$
The reader may note that the next-to-last integrand represents the surface area of the sphere of radius $r$. Thus, we are, in effect, computing the volume by summing up (i.e., integrating) the surface areas of concentric thin spherical shells.
Remark: Sometimes, we will be sloppy and use the same letter for a function in an alternative coordinate system. Thus, we may use $f(r, \phi, \theta)$ to represent the spherical coordinate form (B.67) of a function $f(x, y, z)$. Technically, this is not correct! However, the clarity and intuition sometimes outweighs the pedantic use of a new letter each time we change coordinates. Moreover, in geometry and modern physical theories, [dg], the symbol $f$ represents an intrinsic scalar field, and $f(x, y, z)$ and $f(r, \phi, \theta)$ are merely its incarnations in two different coordinate charts on $\mathbb{R}^3$. Hopefully, this will be clear from the context.

B.7. Gradient, Divergence, and Curl.


There are three important vector differential operators that play a ubiquitous role in three-dimensional vector calculus, known as the gradient, divergence and curl. We have already encountered their two-dimensional counterparts in Chapter A.

The Gradient
We begin with the three-dimensional version of the gradient operator:
$$
\nabla u = \begin{pmatrix} u_x \\ u_y \\ u_z \end{pmatrix}.
\tag{B.69}
$$
The gradient defines a linear operator that maps a scalar function $u(x, y, z)$ to the vector field whose components are its partial derivatives with respect to the Cartesian coordinates. If $\mathbf{x}(t) = (x(t), y(t), z(t))^T$ is any parametrized curve, then the rate of change in the function $u$ as we move along the curve is given by the inner product
$$
\frac{d}{dt}\, u(x(t), y(t), z(t))
= \frac{\partial u}{\partial x}\, \frac{dx}{dt} + \frac{\partial u}{\partial y}\, \frac{dy}{dt} + \frac{\partial u}{\partial z}\, \frac{dz}{dt}
= \nabla u \cdot \dot{\mathbf{x}}
\tag{B.70}
$$
between the gradient and the tangent vector to the curve. Therefore, as we reasoned earlier in the planar case, the gradient $\nabla u$ points in the direction of steepest increase in the function $u$, while its negative $-\nabla u$ points in the direction of steepest decrease. For example, if $u(x, y, z)$ represents the temperature at a point $(x, y, z)$ in space, then $\nabla u$ points in the direction in which the temperature increases most rapidly, while $-\nabla u$ points in the direction in which it decreases most rapidly. Therefore, if one wants to cool down as rapidly as possible, one should move in the direction of $-\nabla u$ at each instant, which is the direction of the flow of heat energy. Thus, the path $\mathbf{x}(t)$ to be followed for the fastest cool down will be a solution to the gradient flow equations
$$
\dot{\mathbf{x}} = -\,\nabla u,
\tag{B.71}
$$
or, explicitly,
$$
\frac{dx}{dt} = -\,\frac{\partial u}{\partial x}(x, y, z),
\qquad
\frac{dy}{dt} = -\,\frac{\partial u}{\partial y}(x, y, z),
\qquad
\frac{dz}{dt} = -\,\frac{\partial u}{\partial z}(x, y, z).
$$
A solution $\mathbf{x}(t)$ to such a system of ordinary differential equations will experience continuously decreasing temperature. In Chapter 18, we will learn how to use such gradient flows to locate and numerically approximate the minima of functions.
The set of all points where a scalar field $u(x, y, z)$ has a given value,
$$
u(x, y, z) = c,
\tag{B.72}
$$
for some fixed constant $c$, is known as a level set of $u$. If $u$ measures temperature, then its level sets are the isothermal surfaces of equal temperature. If $u$ is sufficiently smooth, most of its level sets are smooth surfaces. In fact, if $\nabla u \neq \mathbf{0}$ at a point, then one can prove that all nearby level sets are smooth surfaces near the point in question. This important fact is a consequence of the general Implicit Function Theorem, [106]. Thus, if $\nabla u \neq \mathbf{0}$ at all points on a level set, then the level set is a smooth surface, and, if bounded, a simple closed surface. (On the other hand, finding an explicit parametrization of a level set may be quite difficult!)

Theorem B.21. If nonzero, the gradient vector $\nabla u \neq \mathbf{0}$ defines the normal direction to the level set $\{ u = c \}$ at each point.
Proof: Indeed, suppose $\mathbf{x}(t)$ is any curve contained in the level set, so that
$$
u(x(t), y(t), z(t)) = c \qquad \text{for all } t.
$$
Since $c$ is constant, its derivative with respect to $t$ is zero, and hence, by (B.70),
$$
\frac{d}{dt}\, u(x(t), y(t), z(t)) = \nabla u \cdot \dot{\mathbf{x}} = 0,
$$
which implies that the gradient vector $\nabla u$ is orthogonal to the tangent vector $\dot{\mathbf{x}}$ to the curve. Since this holds for all such curves contained within the level set, the gradient must be orthogonal to the entire tangent plane at the point, and hence, if nonzero, defines a normal direction to the level surface. Q.E.D.
Physically, Theorem B.21 tells us that the direction of steepest increase in temperature is perpendicular to the isothermal surfaces at each point. Consequently, the solutions to the gradient flow equations (B.71) form an orthogonal system of curves to the level set surfaces of $u$, and one should follow these curves to minimize the temperature as rapidly as possible. Similarly, in a steady state fluid flow, the fluid potential is represented by a scalar field $\varphi(x, y, z)$. Its gradient $\mathbf{v} = \nabla\varphi$ determines the fluid velocity at each point. The streamlines followed by the fluid particles are the solutions to the gradient flow equations $\dot{\mathbf{x}} = \mathbf{v} = \nabla\varphi$, while the level sets of $\varphi$ are the equipotential surfaces. Thus, fluid particles flow in a direction orthogonal to the equipotential surfaces.
Example B.22. The level sets of the radial function $u = x^2 + y^2 + z^2$ are the concentric spheres centered at the origin. Its gradient $\nabla u = (2x, 2y, 2z)^T = 2\,\mathbf{x}$ points in the radial direction, orthogonal to each spherical level set. Note that $\nabla u = \mathbf{0}$ only at the origin, which is a level set, but not a smooth surface.
The radial vector also specifies the direction of fastest increase (decrease) in the function $u$. Indeed, the solution to the associated gradient flow system (B.71), namely
$$
\dot{\mathbf{x}} = -\,2\,\mathbf{x},
\qquad \text{is} \qquad
\mathbf{x}(t) = \mathbf{x}_0\, e^{-2t},
$$
where $\mathbf{x}_0 = \mathbf{x}(0)$ is the initial position. Therefore, to decrease the function $u$ as rapidly as possible, one should follow a radial ray into the origin.
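The explicit solution can be compared against a numerical integration of the gradient flow (B.71). The sketch below is our own illustration (assuming SciPy and NumPy are available); the numerical trajectory reproduces $\mathbf{x}_0 e^{-2t}$ to within the solver's default tolerance.

    import numpy as np
    from scipy.integrate import solve_ivp

    grad_u = lambda x, y, z: np.array([2*x, 2*y, 2*z])   # gradient of u = x^2 + y^2 + z^2

    x0 = np.array([1.0, 2.0, -1.0])                      # illustrative initial point
    sol = solve_ivp(lambda t, x: -grad_u(*x), (0.0, 1.0), x0)   # gradient flow (B.71)

    print(sol.y[:, -1])        # numerical solution at t = 1
    print(x0 * np.exp(-2.0))   # exact solution x0 * exp(-2t) from Example B.22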
Example B.23. An implicit equation for the torus (B.28) is obtained by replacing $r = \sqrt{x^2 + y^2}$ in (B.29). In this manner, we are led to consider the level sets of the function
$$
u(x, y, z) = x^2 + y^2 + z^2 - 4 \sqrt{x^2 + y^2} = c,
\tag{B.73}
$$
with the particular value $c = -3$ corresponding to (B.28). The gradient of the function is
$$
\nabla u(x, y, z) = \left( 2x - \frac{4x}{\sqrt{x^2 + y^2}},\;\; 2y - \frac{4y}{\sqrt{x^2 + y^2}},\;\; 2z \right)^{\!T},
\tag{B.74}
$$
which is well-defined except on the $z$ axis, where $x = y = 0$. Note that $\nabla u \neq \mathbf{0}$ unless $z = 0$ and $x^2 + y^2 = 4$. Therefore, the level sets of $u$ are smooth, toroidal surfaces, except for the $z$ axis and the circle of radius 2 in the $(x, y)$ plane.

Divergence and Curl

The second important vector differential operator is the divergence,
$$
\operatorname{div} \mathbf{v} = \nabla \cdot \mathbf{v} = \frac{\partial v_1}{\partial x} + \frac{\partial v_2}{\partial y} + \frac{\partial v_3}{\partial z}.
\tag{B.75}
$$
The divergence maps a vector field $\mathbf{v} = (v_1, v_2, v_3)^T$ to a scalar field $f = \nabla \cdot \mathbf{v}$. For example, the radial vector field $\mathbf{v} = (x, y, z)^T$ has constant divergence $\nabla \cdot \mathbf{v} = 3$.
In fluid mechanics, the divergence measures the local, instantaneous change in the volume of a fluid packet as it moves. Thus, a steady state fluid flow is incompressible, with unchanging volume, if and only if its velocity vector field is divergence-free: $\nabla \cdot \mathbf{v} \equiv 0$. The connection between incompressibility and the earlier zero-flux condition will be addressed in the Divergence Theorem B.36 below.
The composition of divergence and gradient,
$$
\nabla \cdot \nabla u = \Delta u = u_{xx} + u_{yy} + u_{zz},
$$
produces the Laplacian operator, just as in two dimensions. Indeed, as we shall see, except for the missing minus sign and the all-important boundary conditions, this is effectively the same as the self-adjoint form of the three-dimensional Laplacian:
$$
\nabla^{*} \circ \nabla\, u = -\,\nabla \cdot \nabla u = -\,\Delta u.
$$
See (Lapsa3) below for details.
The third important vector differential operator is the curl, which, in three dimensions, maps vector fields to vector fields. It is most easily memorized in the form of a (formal) $3 \times 3$ determinant:
$$
\operatorname{curl} \mathbf{v} = \nabla \wedge \mathbf{v}
= \begin{pmatrix}
\dfrac{\partial v_3}{\partial y} - \dfrac{\partial v_2}{\partial z} \\[2mm]
\dfrac{\partial v_1}{\partial z} - \dfrac{\partial v_3}{\partial x} \\[2mm]
\dfrac{\partial v_2}{\partial x} - \dfrac{\partial v_1}{\partial y}
\end{pmatrix}
= \det \begin{pmatrix} \partial_x & v_1 & \mathbf{e}_1 \\ \partial_y & v_2 & \mathbf{e}_2 \\ \partial_z & v_3 & \mathbf{e}_3 \end{pmatrix},
\tag{B.76}
$$
in analogy with the determinantal form (B.6) of the cross product. For instance, the radial vector field $\mathbf{v} = (x, y, z)^T$ has zero curl:
$$
\nabla \wedge \mathbf{v} = \det \begin{pmatrix} \partial_x & x & \mathbf{e}_1 \\ \partial_y & y & \mathbf{e}_2 \\ \partial_z & z & \mathbf{e}_3 \end{pmatrix} = \mathbf{0}.
$$
This is indicative of the lack of any rotational effect of the induced flow.
If $\mathbf{v}$ represents the velocity vector field of a steady state fluid flow, its curl $\nabla \wedge \mathbf{v}$ measures the instantaneous rotation of the fluid flow at a point. When non-zero, the direction of the curl vector represents the axis of rotation, while its magnitude $\| \nabla \wedge \mathbf{v} \|$ measures the instantaneous angular velocity of the swirling flow. Physically, if we place a microscopic turbine in the fluid so that its shaft points in the direction specified by a unit vector $\mathbf{n}$, then its rate of spin will be proportional to the component of the curl vector $\nabla \wedge \mathbf{v}$ in the direction of its shaft. This is equal to the dot product
$$
\mathbf{n} \cdot (\nabla \wedge \mathbf{v}) = \| \nabla \wedge \mathbf{v} \| \cos\theta,
$$
where $\theta$ is the angle between $\mathbf{n}$ and the curl vector. Therefore, the maximal rate of spin will occur when $\theta = 0$, and so the shaft of the turbine lines up with the direction of the curl vector $\nabla \wedge \mathbf{v}$. In this orientation, the angular velocity of the turbine will be proportional to its magnitude $\| \nabla \wedge \mathbf{v} \|$. On the other hand, if the axis of the turbine is orthogonal to the direction of the curl, then it will not rotate. If $\nabla \wedge \mathbf{v} \equiv \mathbf{0}$, then there is no net motion of a turbine, no matter in which orientation it is placed in the fluid flow. Thus, a flow with zero curl is irrotational. The precise connection between this definition and the earlier zero circulation condition will be explained shortly.
T

v = ( y, x, 1 ) .

Integrating the ordinary differential equations x = v, namely

x = y,

y = x,

z = 1,

with initial conditions x(0) = x0 , y(0) = y0 , z(0) = z0 gives the flow


x(t) = x0 cos t y0 sin t,

y(t) = x0 sin t + y0 cos t,

z(t) = z0 + t. (B.77)

Therefore, the fluid particles move along helices spiraling up the z axis, as illustrated in
Figure hel .
The divergence of the vector field v is
v =

( y) +
x+
1 = 0,
x
y
z

and hence the flow is incompressible. Indeed, any fluid packet will spiral up the z axis
unchanged in shape, and so its volume does not change.
The curl of the velocity is

1
x
y

z

= 0 ,
v =
(
y)

1
z
x

x
( y)
x
y

which points along the z-axis. This reflects the fact that the flow is spiraling up the z-axis.
If a turbine is placed in the fluid at an angle with the z-axis, then its rate of rotation
will be proportional to 2 cos .
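Both computations are easy to reproduce symbolically. The following SymPy sketch is our own illustration (writing out the divergence and curl componentwise rather than relying on any special vector-calculus module); it confirms $\nabla \cdot \mathbf{v} = 0$ and $\nabla \wedge \mathbf{v} = (0, 0, 2)^T$.

    import sympy as sp

    x, y, z = sp.symbols('x y z')
    v = sp.Matrix([-y, x, 1])                 # helical flow of Example B.24

    div = sum(v[i].diff(s) for i, s in enumerate((x, y, z)))
    curl = sp.Matrix([v[2].diff(y) - v[1].diff(z),
                      v[0].diff(z) - v[2].diff(x),
                      v[1].diff(x) - v[0].diff(y)])
    print(div)       # 0: the flow is incompressible
    print(curl.T)    # Matrix([[0, 0, 2]]): rotation about the z axis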

Example B.25. Any planar vector field $\mathbf{v} = (v_1(x, y),\, v_2(x, y))^T$ can be identified with a three-dimensional vector field
$$
\mathbf{v} = (v_1(x, y),\; v_2(x, y),\; 0)^T
$$
that has no vertical component. If $\mathbf{v}$ represents a fluid velocity, then the fluid particles remain on the horizontal planes $\{ z = c \}$, and the individual planar flows are identical. Its three-dimensional curl
$$
\nabla \wedge \mathbf{v} = \left( 0,\;\; 0,\;\; \frac{\partial v_2}{\partial x} - \frac{\partial v_1}{\partial y} \right)^{\!T}
$$
is a purely vertical vector field, whose third component agrees with the scalar two-dimensional curl (A.18) of $\mathbf{v}$. This provides the direct identification between the two- and three-dimensional versions of the curl operation. Indeed, our analysis of flows around airfoils in Chapter 15 directly relied upon this identification between two- and three-dimensional flows.
Interconnections and Connectedness
The three basic vector differential operators, the gradient, curl and divergence, are intimately inter-related. The proof of the key identities relies on the equality of mixed partial derivatives, which in turn requires that the functions involved be sufficiently smooth. We leave the explicit verification of the key result to the reader.
Proposition B.26. If $u$ is a smooth scalar field, then $\nabla \wedge \nabla u \equiv \mathbf{0}$. If $\mathbf{v}$ is a smooth vector field, then $\nabla \cdot (\nabla \wedge \mathbf{v}) \equiv 0$.
Therefore, the curl of any gradient vector field is automatically zero. As a consequence, all gradient vector fields represent irrotational flows. Also, the divergence of any vector field that is a curl is automatically zero. Thus, all curl vector fields represent incompressible flows. On the other hand, the divergence of a gradient vector field is the Laplacian of the underlying potential, as we previously noted, and hence is zero if and only if the potential is a harmonic function.
The converse statements are almost true. As in the two-dimensional case, the precise statement of this result depends upon the topology of the underlying domain. In two dimensions, we only had to worry about whether or not the domain contained any holes, i.e., whether or not the domain was simply connected. Similar concerns arise in three dimensions. Moreover, there are two possible classes of holes in a solid domain, and so there are two different types of connectivity. For lack of a better terminology, we introduce the following definition.
Definition B.27. A domain $\Omega \subset \mathbb{R}^3$ is said to be
(a) 0-connected or pathwise connected if there is a curve $C \subset \Omega$ connecting any two points $\mathbf{x}_0, \mathbf{x}_1 \in \Omega$, so that $\partial C = \{ \mathbf{x}_0, \mathbf{x}_1 \}$†.

† We use the notation $\partial C$ to denote the endpoints of a curve $C$.


(b) 1-connected if every unknotted simple closed curve $C \subset \Omega$ is the boundary, $C = \partial S$, of an oriented surface $S \subset \Omega$.
(c) 2-connected if every simple closed surface $S \subset \Omega$ is the boundary, $S = \partial D$, of a subdomain $D \subset \Omega$.
Remark: The unknotted condition is to avoid considering wild curves that fail to bound any oriented surface $S \subset \mathbb{R}^3$ whatsoever.
For example, $\mathbb{R}^3$ is both 0-, 1- and 2-connected, as are all solid balls, cubes, tetrahedra, solid cylinders, and so on. A disjoint union of balls is not 0-connected, although it does remain both 1- and 2-connected. The domain $\Omega = \{\, 0 \leq r < \sqrt{x^2 + y^2} < R \,\}$ lying between two cylinders is not 1-connected, since it has a one-dimensional hole drilled through it. Indeed, if $C \subset \Omega$ is any closed curve that encircles the inner cylinder, then every bounding surface $S$ with $\partial S = C$ must pass across the inner cylinder, and hence will not lie entirely within the domain. On the other hand, this cylindrical domain is both 0- and 2-connected: even an annular surface that encircles the inner cylinder will bound a solid annular domain contained inside $\Omega$. Similarly, the domain $\Omega = \{\, 0 \leq r < \| \mathbf{x} \| < R \,\}$ between two concentric spheres is 0- and 1-connected, but not 2-connected, owing to the spherical cavity inside. Any closed curve $C \subset \Omega$ will bound a surface $S \subset \Omega$; for instance, a circle going around the equator of the inner sphere will still bound a hemispherical surface that does not pass through the spherical cavity. On the other hand, a sphere that lies between the inner and outer spheres will not bound a solid domain contained within the domain. A full discussion of the topology underlying the various types of connectivity, the nature of holes and cavities, and their connection with the existence of scalar and vector potentials, must be deferred to a more advanced course in differential topology, [17, 55].
We can now state the basic theorem relating the connectivity of domains to the kernels
of the fundamental vector differential operators; see [17] for details.
Theorem B.28. Let R 3 be a domain.
(a) If is 0connected, then a scalar field u(x, y, z) defined on all of has vanishing
gradient, u 0, if and only if u(x, y, z) = constant.
(b) If is 1connected, then a vector field v(x, y, z) defined on all of has vanishing
curl, v 0, if and only if there is a scalar field , known as a scalar potential
for v, such that v = .
(c) If is 2connected, then a vector field v(x, y, z) defined on all of has vanishing
divergence, v 0, if and only if there is a vector field w, known as a vector
potential for v, such that v = w.
If $\mathbf{v}$ represents the velocity vector field of a steady-state fluid flow, then the curl-free
condition $\nabla \times \mathbf{v} \equiv 0$ corresponds to an irrotational flow. Thus, on a 1-connected domain,
every irrotational flow field $\mathbf{v}$ has a scalar potential $\varphi$ with $\nabla \varphi = \mathbf{v}$. The divergence-free
condition $\nabla \cdot \mathbf{v} \equiv 0$ corresponds to an incompressible flow. If the domain is 2-connected,
every incompressible flow field $\mathbf{v}$ has a vector potential $\mathbf{w}$ that satisfies $\nabla \times \mathbf{w} = \mathbf{v}$. The
vector potential can be viewed as the three-dimensional analog of the stream function for
planar flows. If the fluid is both irrotational and incompressible, then its scalar potential
satisfies

    $0 = \nabla \cdot \mathbf{v} = \nabla \cdot \nabla \varphi = \Delta \varphi,$

which is Laplace's equation! Thus, just as in the two-dimensional case, the scalar potential
of an irrotational, incompressible fluid flow is a harmonic function. This fact is used in
modeling many problems arising in physical fluids, including water waves, [Lighthill].
Unfortunately, in three dimensions there is no counterpart of complex function theory
to represent the solutions of the Laplace equation, or to connect the vector and scalar
potentials.
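The chain of identities $0 = \nabla \cdot \mathbf{v} = \nabla \cdot \nabla \varphi = \Delta \varphi$ is easy to confirm symbolically. The following sketch, which uses Python's sympy library and is an illustrative aside rather than part of the text, checks that the gradient of an arbitrary scalar potential is automatically curl-free, and that its divergence equals the Laplacian, so an irrotational, incompressible flow has a harmonic potential.

# Symbolic check (sympy): for any scalar potential phi(x, y, z),
# curl(grad phi) = 0 and div(grad phi) equals the Laplacian of phi.
import sympy as sp

x, y, z = sp.symbols('x y z')
phi = sp.Function('phi')(x, y, z)      # arbitrary scalar potential

grad = lambda f: sp.Matrix([sp.diff(f, x), sp.diff(f, y), sp.diff(f, z)])
div  = lambda F: sp.diff(F[0], x) + sp.diff(F[1], y) + sp.diff(F[2], z)
curl = lambda F: sp.Matrix([sp.diff(F[2], y) - sp.diff(F[1], z),
                            sp.diff(F[0], z) - sp.diff(F[2], x),
                            sp.diff(F[1], x) - sp.diff(F[0], y)])

v = grad(phi)                           # irrotational velocity field v = grad(phi)
laplacian = sp.diff(phi, x, 2) + sp.diff(phi, y, 2) + sp.diff(phi, z, 2)
print(sp.simplify(curl(v)))             # zero vector: v is curl-free
print(sp.simplify(div(v) - laplacian))  # 0: incompressibility forces phi to be harmonic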
Example B.29. The vector field

    $\mathbf{v} = ( y, -x, 1 )^T$

that generates the helical flow (B.77) satisfies $\nabla \cdot \mathbf{v} = 0$, and so is divergence-free, reconfirming
our observation that the flow is incompressible. Since $\mathbf{v}$ is defined on all of $\mathbb{R}^3$,
Theorem B.28 assures us that there is a vector potential $\mathbf{w}$ that satisfies $\nabla \times \mathbf{w} = \mathbf{v}$. One
candidate for the vector potential is

    $\mathbf{w} = \bigl( -y,\ 0,\ \tfrac{1}{2} x^2 + \tfrac{1}{2} y^2 \bigr)^T.$

The helical flow is not irrotational, and so it does not admit a scalar potential.

Remark: The construction of a vector potential is not entirely straightforward, but we
will not dwell on this problem. Unlike a scalar potential, which, when it exists, is uniquely
defined up to a constant, there is, in fact, quite a bit of ambiguity in a vector potential.
Adding in any gradient,

    $\widetilde{\mathbf{w}} = \mathbf{w} + \nabla \varphi,$

will give an equally valid vector potential. Indeed, using Proposition B.26, we have

    $\nabla \times \widetilde{\mathbf{w}} = \nabla \times \mathbf{w} + \nabla \times \nabla \varphi = \nabla \times \mathbf{w}.$

Thus, any vector field of the form

    $\mathbf{w} = \Bigl( -y + \dfrac{\partial \varphi}{\partial x},\ \ \dfrac{\partial \varphi}{\partial y},\ \ \dfrac{x^2}{2} + \dfrac{y^2}{2} + \dfrac{\partial \varphi}{\partial z} \Bigr)^T,$

where $\varphi(x, y, z)$ is an arbitrary function, is also a valid vector potential for the helical
vector field $\mathbf{v} = ( y, -x, 1 )^T$.
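These statements can be verified symbolically. The following sympy sketch, an illustrative aside and not part of the text, checks that the candidate $\mathbf{w}$ above satisfies $\nabla \times \mathbf{w} = \mathbf{v}$, and that adding the gradient of an arbitrary function $\varphi$ leaves the curl unchanged.

# Symbolic check (sympy): curl(w) reproduces the helical field v = (y, -x, 1),
# and adding the gradient of an arbitrary function phi does not change the curl.
import sympy as sp

x, y, z = sp.symbols('x y z')
phi = sp.Function('phi')(x, y, z)

curl = lambda F: sp.Matrix([sp.diff(F[2], y) - sp.diff(F[1], z),
                            sp.diff(F[0], z) - sp.diff(F[2], x),
                            sp.diff(F[1], x) - sp.diff(F[0], y)])

v = sp.Matrix([y, -x, 1])                              # helical velocity field
w = sp.Matrix([-y, 0, x**2/2 + y**2/2])                # candidate vector potential
w_tilde = w + sp.Matrix([sp.diff(phi, x), sp.diff(phi, y), sp.diff(phi, z)])

print(sp.simplify(curl(w) - v))        # zero vector: curl w = v
print(sp.simplify(curl(w_tilde) - v))  # zero vector: the gradient term drops out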

B.8. The Fundamental Integration Theorems.


In three-dimensional vector calculus there are 3 fundamental differential operators:
gradient, curl and divergence. There are also 3 types of integration: line, surface
and volume integrals. And, not coincidentally, there are 3 basic theorems that generalize
the Fundamental Theorem of Calculus to line, surface and volume integrals in
three-dimensional space. In all three results, the integral of some differentiated quantity over a
curve, surface, or domain is related to an integral of the quantity over its boundary. The
first theorem relates the line integral of a gradient over a curve to the values of the function
at the boundary or endpoints of the curve. Stokes' Theorem relates the surface integral of
the curl of a vector field to the line integral of the vector field around the boundary curve
of the surface. Finally, the Divergence Theorem, also known as Gauss' Theorem, relates
the volume integral of the divergence of a vector field to the surface integral of that vector
field over the boundary of the domain.
The Fundamental Theorem for Line Integrals
We begin with the Fundamental Theorem for line integrals. This is identical to the
planar version, as stated earlier in Theorems A.20 and A.21. We do not need to reproduce
its proof again here.
Theorem B.30. Let $C \subset \mathbb{R}^3$ be a curve that starts at the endpoint $\mathbf{a}$ and goes to
the endpoint $\mathbf{b}$. Then the line integral of the gradient of a function along $C$ is given by

    $\displaystyle \int_C \nabla u \cdot d\mathbf{x} = u(\mathbf{b}) - u(\mathbf{a}).$    (B.78)
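The theorem is easy to test on a concrete example. The following sympy sketch, an illustrative aside with an arbitrarily chosen function and curve, evaluates both sides of (B.78) and confirms that they agree.

# Check of the Fundamental Theorem for line integrals (B.78) on one example:
# integrate grad(u) along a parametrized curve and compare with u(b) - u(a).
import sympy as sp

x, y, z, t = sp.symbols('x y z t')
u = x*y + z                                      # sample scalar function (illustrative choice)

# helical curve C, parametrized for 0 <= t <= 2*pi
curve = sp.Matrix([sp.cos(t), sp.sin(t), t])
a = curve.subs(t, 0)                             # starting endpoint
b = curve.subs(t, 2*sp.pi)                       # final endpoint

grad_u = sp.Matrix([sp.diff(u, x), sp.diff(u, y), sp.diff(u, z)])
integrand = grad_u.subs({x: curve[0], y: curve[1], z: curve[2]}).dot(sp.diff(curve, t))
line_integral = sp.integrate(integrand, (t, 0, 2*sp.pi))

difference = u.subs({x: b[0], y: b[1], z: b[2]}) - u.subs({x: a[0], y: a[1], z: a[2]})
print(sp.simplify(line_integral - difference))   # prints 0: both sides equal 2*pi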

Since its value only depends upon the endpoints, the line integral of a gradient is
independent of path. In particular, if $C$ is a closed curve, then $\mathbf{a} = \mathbf{b}$, and so the endpoint
contributions cancel out:

    $\displaystyle \oint_C \nabla u \cdot d\mathbf{x} = 0.$

Conversely, if $\mathbf{v}$ is any vector field with the property that its integral around any closed
curve vanishes,

    $\displaystyle \oint_C \mathbf{v} \cdot d\mathbf{x} = 0,$    (B.79)

then $\mathbf{v} = \nabla \varphi$ admits a potential. Indeed, as long as the domain is 0-connected, one can
construct a potential $\varphi(\mathbf{x})$ by integrating over any convenient curve $C$ connecting a fixed
point $\mathbf{a}$ to the point $\mathbf{x}$:

    $\displaystyle \varphi(\mathbf{x}) = \int_{\mathbf{a}}^{\mathbf{x}} \mathbf{v} \cdot d\mathbf{x}.$

The proof that this is a well-defined potential is similar to the planar version discussed in
Chapter A.
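The construction just described can be carried out symbolically. In the following sympy sketch, an illustrative aside with an arbitrarily chosen curl-free field, the potential is obtained by integrating along straight-line segments from the origin, and its gradient recovers the field.

# Constructing a scalar potential for a curl-free field by line integration,
# using straight-line paths from a = 0 to the point x = (x, y, z).
# The field v below is an illustrative choice, not taken from the text.
import sympy as sp

x, y, z, t = sp.symbols('x y z t')
v = sp.Matrix([y*z, x*z, x*y + 2*z])          # curl-free sample field

# integrate v along the segment s(t) = t*(x, y, z), 0 <= t <= 1
point = sp.Matrix([x, y, z])
integrand = v.subs({x: t*x, y: t*y, z: t*z}, simultaneous=True).dot(point)
potential = sp.integrate(integrand, (t, 0, 1))
print(sp.expand(potential))                   # x*y*z + z**2

grad = sp.Matrix([sp.diff(potential, x), sp.diff(potential, y), sp.diff(potential, z)])
print(sp.simplify(grad - v))                  # zero vector: grad(potential) = v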
Example B.31. Line integrals over cylindrical and spherical domains.
If $\mathbf{v}$ represents the velocity vector field of a three-dimensional steady state fluid flow,
then its line integral around a closed curve $C$, namely

    $\displaystyle \oint_C \mathbf{v} \cdot d\mathbf{x} = \oint_C \mathbf{v} \cdot \mathbf{t} \, ds,$

is the integral of the tangential component of the velocity vector field. This represents the
circulation of the fluid around the curve $C$. In particular, if the circulation line integral is 0
for every closed curve, then the velocity admits a potential, $\mathbf{v} = \nabla \varphi$, and hence
$\nabla \times \mathbf{v} = 0$, so the fluid flow is irrotational.
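By contrast, the helical flow of Example B.29 has nonzero circulation around horizontal circles, confirming that it is not irrotational and that its line integrals are not path-independent. The following sympy sketch, an illustrative aside and not part of the text, computes this circulation.

# Circulation of the helical flow v = (y, -x, 1) around a horizontal circle of
# radius r: the result -2*pi*r**2 is nonzero, so the flow is not irrotational.
import sympy as sp

t, r, c = sp.symbols('t r c', positive=True)
circle = sp.Matrix([r*sp.cos(t), r*sp.sin(t), c])      # closed curve at height z = c

v = lambda x, y, z: sp.Matrix([y, -x, 1])              # helical velocity field of Example B.29
integrand = v(*circle).dot(sp.diff(circle, t))
circulation = sp.integrate(integrand, (t, 0, 2*sp.pi))
print(sp.simplify(circulation))                        # -2*pi*r**2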
Stokes' Theorem
The second of the three fundamental integration theorems is known as Stokes' Theorem.
This important result relates the circulation line integral of a vector field around
a closed curve with the integral of its curl over any bounding surface. Stokes' Theorem
first appeared in an 1850 letter from Lord Kelvin (William Thomson) written to George
Stokes, who made it into an undergraduate exam question for the Smith Prize at Cambridge
University in England.
Theorem B.32. Let $S \subset \mathbb{R}^3$ be an oriented, bounded surface whose boundary $\partial S$
consists of one or more piecewise smooth simple closed curves. Let $\mathbf{v}$ be a smooth vector
field defined on $S$. Then

    $\displaystyle \oint_{\partial S} \mathbf{v} \cdot d\mathbf{x} = \iint_S (\nabla \times \mathbf{v}) \cdot \mathbf{n} \, dS.$    (B.80)

To make sense of Stokes' formula (B.80), we need to assign a consistent orientation to
the surface, meaning a choice of unit normal $\mathbf{n}$, and to its boundary curve, meaning
a direction to go around it. The proper choice is described by the following left hand
rule: if we walk along the boundary $\partial S$ with the normal vector $\mathbf{n}$ on $S$ pointing upwards,
then the surface should be on our left hand side; see Figure Stokes . For example, if
$S \subset \{ z = 0 \}$ is a planar domain and we choose the upwards normal $\mathbf{n} = ( 0, 0, 1 )^T$, then
$C$ should be oriented in the usual, counterclockwise direction. Indeed, in this case, Stokes'
Theorem B.32 reduces to Green's Theorem A.25!
Stokes' formula (B.80) can be rewritten using the alternative notations (B.21), (B.47),
for surface and line integrals, in the form

    $\displaystyle \oint_{\partial S} u \, dx + v \, dy + w \, dz = \iint_S \Bigl( \frac{\partial w}{\partial y} - \frac{\partial v}{\partial z} \Bigr) dy \, dz + \Bigl( \frac{\partial u}{\partial z} - \frac{\partial w}{\partial x} \Bigr) dz \, dx + \Bigl( \frac{\partial v}{\partial x} - \frac{\partial u}{\partial y} \Bigr) dx \, dy.$    (B.81)
Recall that a closed surface is one without boundary: $\partial S = \emptyset$. In this case, the left
hand side of Stokes' formula (B.80) is zero, and we find that integrals of curls vanish on
closed surfaces.
Proposition B.33. If the vector field $\mathbf{v} = \nabla \times \mathbf{w}$ is a curl, then

    $\displaystyle \iint_S \mathbf{v} \cdot \mathbf{n} \, dS = 0$

for every closed surface $S$.

Thus, every curl vector field defines a surface-independent integral.
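As a concrete illustration of Proposition B.33, here is a sympy sketch, an illustrative aside whose vector potential $\mathbf{w}$ is an arbitrary choice, checking that the flux of a curl field through the unit sphere vanishes.

# Flux of a curl field through the unit sphere is zero (Proposition B.33).
# Here w = (0, x**2, y*z) is an arbitrary illustrative choice, with v = curl(w) = (z, 0, 2*x).
import sympy as sp

x, y, z, theta, phi = sp.symbols('x y z theta phi')

w = sp.Matrix([0, x**2, y*z])
v = sp.Matrix([sp.diff(w[2], y) - sp.diff(w[1], z),
               sp.diff(w[0], z) - sp.diff(w[2], x),
               sp.diff(w[1], x) - sp.diff(w[0], y)])   # v = (z, 0, 2*x)

# parametrize the unit sphere; the outward unit normal equals the position vector,
# and the surface area element is sin(theta) dtheta dphi
n = sp.Matrix([sp.sin(theta)*sp.cos(phi), sp.sin(theta)*sp.sin(phi), sp.cos(theta)])
integrand = v.subs({x: n[0], y: n[1], z: n[2]}).dot(n) * sp.sin(theta)
flux = sp.integrate(integrand, (phi, 0, 2*sp.pi), (theta, 0, sp.pi))
print(flux)    # 0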


Example B.34. Let $S = \{\, x + y + z = 1, \ x > 0, \ y > 0, \ z > 0 \,\}$ denote the triangular
surface considered in Example B.15. Its boundary $\partial S = L_x \cup L_y \cup L_z$ is a triangle
composed of the three line segments

    $L_x = \{\, x = 0, \ y + z = 1, \ y \geq 0, \ z \geq 0 \,\}, \quad L_y = \{\, y = 0, \ x + z = 1, \ x \geq 0, \ z \geq 0 \,\}, \quad L_z = \{\, z = 0, \ x + y = 1, \ x \geq 0, \ y \geq 0 \,\}.$

To compute the line integral

    $\displaystyle \oint_{\partial S} \mathbf{v} \cdot d\mathbf{x} = \oint_{\partial S} y^2 \, dx + x z^2 \, dy$

of the vector field $\mathbf{v} = ( y^2, x z^2, 0 )^T$, we could proceed directly, but this would require
evaluating three separate integrals over the three sides of the triangle. As an alternative, we
can use Stokes' formula (B.80), and compute the integral of its curl $\nabla \times \mathbf{v} = ( 2 y, 2 x z, 0 )^T$
over the triangle, which is

    $\displaystyle \oint_{\partial S} \mathbf{v} \cdot d\mathbf{x} = \iint_S (\nabla \times \mathbf{v}) \cdot \mathbf{n} \, dS = \iint_S 2 y \, dy \, dz + 2 x z \, dz \, dx = \frac{17}{12},$

where this particular surface integral was already computed in Example B.15.

We remark that Stokes' Theorem B.32 is consistent with Theorem B.28. Suppose
that $\mathbf{v}$ is a curl-free vector field, so $\nabla \times \mathbf{v} \equiv 0$, which is defined on a 1-connected domain
$\Omega \subset \mathbb{R}^3$. Since every simple (unknotted†) closed curve $C \subset \Omega$ bounds a surface, $C = \partial S$,
with $S$ also contained inside the domain, Stokes' formula (B.80) implies

    $\displaystyle \oint_C \mathbf{v} \cdot d\mathbf{x} = \iint_S (\nabla \times \mathbf{v}) \cdot \mathbf{n} \, dS = 0.$

Since this happens for every $C \subset \Omega$, the path-independence condition (B.79) is
satisfied, and hence $\mathbf{v} = \nabla \varphi$ admits a potential.
Example B.35. The Newtonian gravitational force field

    $\displaystyle \mathbf{v}(\mathbf{x}) = \frac{\mathbf{x}}{\| \mathbf{x} \|^3} = \frac{( x, y, z )^T}{(x^2 + y^2 + z^2)^{3/2}}$

is well defined on $\Omega = \mathbb{R}^3 \setminus \{ \mathbf{0} \}$, and is divergence-free: $\operatorname{div} \mathbf{v} \equiv 0$. Nevertheless, this vector
field does not admit a vector potential. Indeed, on the sphere $S_a = \{\, \| \mathbf{x} \| = a \,\}$ of radius
$a$, the unit normal vector at a point $\mathbf{x} \in S_a$ is $\mathbf{n} = \mathbf{x} / \| \mathbf{x} \|$. Therefore,

    $\displaystyle \iint_{S_a} \mathbf{v} \cdot \mathbf{n} \, dS = \iint_{S_a} \frac{\mathbf{x}}{\| \mathbf{x} \|^3} \cdot \frac{\mathbf{x}}{\| \mathbf{x} \|} \, dS = \iint_{S_a} \frac{1}{\| \mathbf{x} \|^2} \, dS = \frac{1}{a^2} \iint_{S_a} dS = 4 \pi,$

since $S_a$ has surface area $4 \pi a^2$. Note that this result is independent of the radius of the
sphere. If $\mathbf{v} = \nabla \times \mathbf{w}$, this would contradict Proposition B.33.
The problem is, of course, that the domain $\Omega$ is not 2-connected, and so Theorem B.28
does not apply. However, it would apply to the vector field $\mathbf{v}$ on any 2-connected
subdomain, for example the domain $\widetilde{\Omega} = \mathbb{R}^3 \setminus \{\, x = y = 0, \ z \leq 0 \,\}$ obtained by omitting the
negative $z$-axis. Exercise asks you to construct a vector potential in this case.
We further note that $\mathbf{v}$ is curl-free: $\nabla \times \mathbf{v} \equiv 0$. Since the domain of definition is
1-connected, Theorem B.28 tells us that $\mathbf{v}$ admits a scalar potential, namely the Newtonian
gravitational potential. Indeed, $\nabla \bigl( - \| \mathbf{x} \|^{-1} \bigr) = \mathbf{v}$, as the reader can check.

† It suffices to know this for unknotted curves to conclude it for arbitrary closed curves.
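The claims in Example B.35 are easy to confirm symbolically. The following sympy sketch, an illustrative aside and not part of the text, checks that the inverse-square field is divergence-free and curl-free away from the origin, and that it is the gradient of $-1/\| \mathbf{x} \|$.

# Checks for the inverse-square field v = x/||x||^3 away from the origin:
# it is divergence-free, curl-free, and equals the gradient of -1/||x||.
import sympy as sp

x, y, z = sp.symbols('x y z', real=True)
r = sp.sqrt(x**2 + y**2 + z**2)
v = sp.Matrix([x, y, z]) / r**3

div_v = sp.simplify(sp.diff(v[0], x) + sp.diff(v[1], y) + sp.diff(v[2], z))
curl_v = sp.simplify(sp.Matrix([sp.diff(v[2], y) - sp.diff(v[1], z),
                                sp.diff(v[0], z) - sp.diff(v[2], x),
                                sp.diff(v[1], x) - sp.diff(v[0], y)]))
grad_potential = sp.Matrix([sp.diff(-1/r, x), sp.diff(-1/r, y), sp.diff(-1/r, z)])

print(div_v)                                   # 0
print(curl_v.T)                                # zero row vector
print(sp.simplify(grad_potential - v).T)       # zero row vector: grad(-1/||x||) = v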

The Divergence Theorem


The last of the three fundamental integral theorems is the Divergence Theorem, also
known as Gauss' Theorem. This result relates a surface flux integral over a closed surface
to a volume integral over the domain it bounds.
Theorem B.36. Let $\Omega \subset \mathbb{R}^3$ be a bounded domain whose boundary $\partial \Omega$ consists
of one or more piecewise smooth simple closed surfaces. Let $\mathbf{n}$ denote the unit outward
normal to the boundary of $\Omega$. Let $\mathbf{v}$ be a smooth vector field defined on $\Omega$ and continuous
up to its boundary. Then

    $\displaystyle \iint_{\partial \Omega} \mathbf{v} \cdot \mathbf{n} \, dS = \iiint_{\Omega} \nabla \cdot \mathbf{v} \, dx \, dy \, dz.$    (B.82)

In terms of the alternative notation (B.47) for surface integrals, the divergence formula
(B.82) can be rewritten in the form

    $\displaystyle \iint_{\partial \Omega} u \, dy \, dz + v \, dz \, dx + w \, dx \, dy = \iiint_{\Omega} \Bigl( \frac{\partial u}{\partial x} + \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \Bigr) dx \, dy \, dz.$    (B.83)

Example B.37. Let us compute the surface integral

    $\displaystyle \iint_S x y \, dz \, dx + z \, dx \, dy$

of the vector field $\mathbf{v} = ( 0, x y, z )^T$ over the sphere $S = \{\, \| \mathbf{x} \| = 1 \,\}$ of radius 1. A direct
evaluation in either graphical or spherical coordinates is not so pleasant. But the divergence
formula (B.83) immediately gives

    $\displaystyle \iint_S x y \, dz \, dx + z \, dx \, dy = \iiint_{\Omega} \Bigl( \frac{\partial (x y)}{\partial y} + \frac{\partial z}{\partial z} \Bigr) dx \, dy \, dz = \iiint_{\Omega} (x + 1) \, dx \, dy \, dz = \iiint_{\Omega} x \, dx \, dy \, dz + \iiint_{\Omega} dx \, dy \, dz = \frac{4}{3} \pi,$

where $\Omega = \{\, \| \mathbf{x} \| < 1 \,\}$ is the unit ball with boundary $\partial \Omega = S$. The final two integrals
are, respectively, the $x$ coordinate of the center of mass of the ball multiplied by its
volume, which is clearly 0, plus the volume of the spherical ball.
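The volume integral can be double-checked in spherical coordinates. The following sympy sketch, an illustrative aside and not part of the text, recovers the value $\frac{4}{3}\pi$.

# Verify Example B.37's volume integral in spherical coordinates:
# div(v) = x + 1 for v = (0, x*y, z), and its integral over the unit ball is 4*pi/3.
import sympy as sp

x, y, z, rho, theta, phi = sp.symbols('x y z rho theta phi')

v = sp.Matrix([0, x*y, z])
div_v = sp.diff(v[0], x) + sp.diff(v[1], y) + sp.diff(v[2], z)   # x + 1

# spherical coordinates: x = rho*sin(theta)*cos(phi), volume element rho**2*sin(theta)
integrand = div_v.subs(x, rho*sp.sin(theta)*sp.cos(phi)) * rho**2 * sp.sin(theta)
volume_integral = sp.integrate(integrand,
                               (rho, 0, 1), (theta, 0, sp.pi), (phi, 0, 2*sp.pi))
print(volume_integral)            # 4*pi/3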
Example B.38. Suppose $\mathbf{v}(t, \mathbf{x})$ is the velocity vector field of a time-dependent
fluid flow. Let $\rho(t, \mathbf{x})$ represent the density of the fluid at time $t$ and position $\mathbf{x}$. Then the
surface flux integral

    $\displaystyle \iint_S (\rho \, \mathbf{v}) \cdot \mathbf{n} \, dS$

represents the mass flux of fluid through the surface $S \subset \mathbb{R}^3$. In particular, if $S = \partial \Omega$
represents a closed surface bounding a domain $\Omega$, then, by the Divergence Theorem B.36,

    $\displaystyle \iint_{\partial \Omega} (\rho \, \mathbf{v}) \cdot \mathbf{n} \, dS = \iiint_{\Omega} \nabla \cdot (\rho \, \mathbf{v}) \, dx \, dy \, dz$

represents the net mass flux out of the domain at time $t$. On the other hand, this must
equal the rate of change of mass in the domain, namely

    $\displaystyle - \frac{\partial}{\partial t} \iiint_{\Omega} \rho \, dx \, dy \, dz = - \iiint_{\Omega} \frac{\partial \rho}{\partial t} \, dx \, dy \, dz,$

the minus sign coming from the fact that we are measuring net mass loss due to outflow.
Equating these two, we discover that

    $\displaystyle \iiint_{\Omega} \Bigl[ \frac{\partial \rho}{\partial t} + \nabla \cdot (\rho \, \mathbf{v}) \Bigr] dx \, dy \, dz = 0$

for every domain $\Omega$ occupied by the fluid. Since the domain is arbitrary, this can only happen
if the integrand vanishes, and hence

    $\displaystyle \frac{\partial \rho}{\partial t} + \nabla \cdot (\rho \, \mathbf{v}) = 0.$    (B.84)

The latter is the basic continuity equation of fluid mechanics, which takes the form of a
conservation law.
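To see the continuity equation in action, here is a sympy sketch; it is an illustrative aside in which the expanding flow $\mathbf{v} = (x, 0, 0)$ and the family of densities are hypothetical choices, not taken from the text.

# Illustrative check of the continuity equation (B.84):
# for the (hypothetical) expanding flow v = (x, 0, 0), densities of the form
# rho = exp(-t)*g(x*exp(-t)) satisfy rho_t + div(rho*v) = 0 for any profile g.
import sympy as sp

t, x, y, z = sp.symbols('t x y z')
g = sp.Function('g')

rho = sp.exp(-t) * g(x*sp.exp(-t))
v = sp.Matrix([x, 0, 0])

flux = rho * v
residual = sp.diff(rho, t) + sp.diff(flux[0], x) + sp.diff(flux[1], y) + sp.diff(flux[2], z)
print(sp.simplify(residual))      # 0: the continuity equation is satisfied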
For a steady state fluid flow, the left hand side of the divergence formula (B.82)
measures the fluid flux through the boundary $\partial \Omega$ of the domain $\Omega$, while the right hand side
integrates the divergence over the domain $\Omega$. As a consequence, the divergence must
represent the net local change in fluid volume at a point under the flow. In particular, if
$\nabla \cdot \mathbf{v} \equiv 0$, then there is no net flux, and the fluid flow is incompressible.
The Divergence Theorem B.36 is also consistent with Theorem B.28. Let $\mathbf{v}$ be a
divergence-free vector field, $\nabla \cdot \mathbf{v} \equiv 0$, defined on a 2-connected domain $\Omega \subset \mathbb{R}^3$. Every
simple closed surface $S \subset \Omega$ bounds a subdomain, so $S = \partial D$, with $D \subset \Omega$ also contained
inside the domain of definition of $\mathbf{v}$. Then, by the divergence formula (B.82),

    $\displaystyle \iint_S \mathbf{v} \cdot \mathbf{n} \, dS = \iiint_D \nabla \cdot \mathbf{v} \, dx \, dy \, dz = 0.$

Therefore, by Theorem B.28, $\mathbf{v} = \nabla \times \mathbf{w}$ admits a vector potential.


Remark: The proof of all three of the fundamental integral theorems can, in fact, be
reduced to the Fundamental Theorem of (one-variable) Calculus. They are, moreover, all
special cases of the general Stokes' Theorem, which forms the foundation of the profound
theory of integration on manifolds, [2, 17, 48]. Stokes' Theorem has deep and beautiful
connections with topology, and is of fundamental importance in modern mathematics
and physics. However, the full ramifications lie beyond the scope of this introductory text.
