\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflics with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Cluster Analysis}
%%\VignetteDepends{scatterplot3d,mclust,mvtnorm,lattice}
\setcounter{chapter}{20}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
%% lower png resolution for vignettes
\SweaveOpts{resolution = 100}
<>=
library("mclust")
library("mvtnorm")
mai <- par("mai")
options(SweaveHooks = list(rmai = function() { par(mai = mai * c(1,1,1,2))}))
data("pottery", package = "HSAUR3")
@
\chapter[Cluster Analysis]{Cluster Analysis: Classifying Romano-British Pottery and
Exoplanets \label{CA}}
\section{Introduction}
\section{Cluster Analysis}
\section{Analysis Using \R{}}
\subsection{Classifying Romano-British Pottery}
We start our analysis with computing the dissimilarity matrix
containing the Euclidean distance of the chemical measurements
on all $\Sexpr{nrow(pottery)}$ pots. The resulting
$\Sexpr{nrow(pottery)} \times \Sexpr{nrow(pottery)}$ matrix
can be inspected by an \stress{image plot}, here obtained from
\index{Image plot}
function \Rcmd{levelplot} available in package \Rpackage{lattice}
\citep{PKG:lattice, HSAUR:Sarkar2008}. Such a plot associates
each cell of the dissimilarity matrix with a color or a gray
value. We choose a very dark grey for cells with distance zero (i.e.,
the diagonal elements of the dissimilarity matrix) and pale values
for cells with greater Euclidean distance. Figure~\ref{CA-pottery-distplot}
leads to the impression that there are at least three distinct groups
with small inter-cluster differences (the dark rectangles) whereas
much larger distances can be observed for all other cells.
\begin{figure}
\begin{center}
<>=
pottery_dist <- dist(pottery[, colnames(pottery) != "kiln"])
library("lattice")
levelplot(as.matrix(pottery_dist), xlab = "Pot Number",
ylab = "Pot Number")
@
<>=
trellis.par.set(standard.theme(color = FALSE))
plot(levelplot(as.matrix(pottery_dist), xlab = "Pot Number", ylab = "Pot Number"))
@
\caption{Image plot of the dissimilarity matrix of the \Robject{pottery} data.
\label{CA-pottery-distplot}}
\end{center}
\end{figure}
We now construct three series of partitions using single, complete, and
average linkage hierarchical clustering as introduced in Subsections~\ref{CA:HC} and \ref{CA:diss}. The function \Rcmd{hclust}
performs all three procedures based on the
dissimilarity matrix of the data; its \Rcmd{method} argument is used
to specify how the distance between two clusters is assessed. The
corresponding \Rcmd{plot} method draws a dendrogram; the code and
results are given in Figure~\ref{CA-pottery-hclust}. Again,
all three dendrograms lead to the impression that three clusters
fit the data best (although this judgement is very informal).
\begin{figure}
\begin{center}
<>=
pottery_single <- hclust(pottery_dist, method = "single")
pottery_complete <- hclust(pottery_dist, method = "complete")
pottery_average <- hclust(pottery_dist, method = "average")
layout(matrix(1:3, ncol = 3))
plot(pottery_single, main = "Single Linkage",
sub = "", xlab = "")
plot(pottery_complete, main = "Complete Linkage",
sub = "", xlab = "")
plot(pottery_average, main = "Average Linkage",
sub = "", xlab = "")
@
\caption{Hierarchical clustering of \Robject{pottery} data and resulting dendrograms.
\label{CA-pottery-hclust}}
\end{center}
\end{figure}
From the \Robject{pottery\_average} object representing
the average linkage hierarchical clustering, we derive
the three-cluster solution by cutting the dendrogram at a
height of four (which, based on the right display in
Figure~\ref{CA-pottery-hclust}
leads to a partition of the data into three groups). Our interest
is now a comparison with the kiln sites at which the pottery was
found.
<>=
pottery_cluster <- cutree(pottery_average, h = 4)
xtabs(~ pottery_cluster + kiln, data = pottery)
@
The contingency table shows that cluster 1 contains all
pots found at kiln site number one, cluster 2 contains all pots
from kiln sites number two and three, and cluster three collects the
ten pots from kiln sites four and five. In fact, the five
kiln sites are from three different regions defined by one, two and three, and
four and five, so the clusters actually correspond to pots from
three different regions.
\subsection{Classifying Exoplanets}
\begin{figure}
\begin{center}
<>=
data("planets", package = "HSAUR3")
library("scatterplot3d")
scatterplot3d(log(planets$mass), log(planets$period),
log(planets$eccen + ifelse(planets$eccen == 0,
0.001, 0)),
type = "h", angle = 55, pch = 16,
y.ticklabs = seq(0, 10, by = 2),
y.margin.add = 0.1, scale.y = 0.7,
xlab = "log(mass)", ylab = "log(period)",
zlab = "log(eccen)")
@
\caption{3D scatterplot of the logarithms of the three variables available
for each of the exoplanets. \label{CA-planets-scatter}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
rge <- apply(planets, 2, max) - apply(planets, 2, min)
planet.dat <- sweep(planets, 2, rge, FUN = "/")
n <- nrow(planet.dat)
wss <- rep(0, 10)
wss[1] <- (n - 1) * sum(apply(planet.dat, 2, var))
for (i in 2:10)
wss[i] <- sum(kmeans(planet.dat,
centers = i)$withinss)
plot(1:10, wss, type = "b", xlab = "Number of groups",
ylab = "Within groups sum of squares")
@
\caption{Within-cluster sum of squares for different numbers of clusters for
the exoplanet data. \label{CA-planets-ss}}
\end{center}
\end{figure}
Sadly Figure~\ref{CA-planets-ss} gives no completely convincing verdict on
the number of groups we should consider, but using a little imagination `little elbows' can be spotted at the three and five group solutions. %%'
We can find the number of planets in each group
using
<>=
planet_kmeans3 <- kmeans(planet.dat, centers = 3)
table(planet_kmeans3$cluster)
@
The centers of the clusters for the untransformed data can be computed using
a small convenience function
<>=
ccent <- function(cl) {
f <- function(i) colMeans(planets[cl == i,])
x <- sapply(sort(unique(cl)), f)
colnames(x) <- sort(unique(cl))
return(x)
}
@
which, applied to the three-cluster solution obtained by $k$-means gets
<>=
ccent(planet_kmeans3$cluster)
@
@
for the three-cluster solution and, for the five cluster solution using
<>=
planet_kmeans5 <- kmeans(planet.dat, centers = 5)
table(planet_kmeans5$cluster)
ccent(planet_kmeans5$cluster)
@
\subsection{Model-based Clustering in \R{}}
We now proceed to apply model-based clustering to the \Robject{planets} data.
\R{} functions for model-based clustering are available in package
\Rpackage{mclust} \citep{PKG:mclust,HSAUR:FraleyRaftery2002}.
Here we use the \Rcmd{Mclust} function since this
selects both the most appropriate model for the data \stress{and}
the optimal number of groups based on the values of the BIC
computed over several models and a range of
values for number of groups. The necessary code is:
<>=
library("mclust")
planet_mclust <- Mclust(planet.dat)
@
and we first examine a plot of BIC values using the \R{} code
that is displayed on top of Figure~\ref{CA-mclust1}.
In this diagram the different plotting symbols refer
to different model assumptions about the shape of clusters:
\begin{description}
\item[EII] spherical, equal volume,
\item[VII] spherical, unequal volume,
\item[EEI] diagonal, equal volume and shape,
\item[VEI] diagonal, varying volume, equal shape,
\item[EVI] diagonal, equal volume, varying shape,
\item[VVI] diagonal, varying volume and shape,
\item[EEE] ellipsoidal, equal volume, shape, and orientation,
\item[EEV] ellipsoidal, equal volume and equal shape,
\item[VEV] ellipsoidal, equal shape,
\item[VVV] ellipsoidal, varying volume, shape, and orientation
\end{description}
\begin{figure}
\begin{center}
<>=
plot(planet_mclust, planet.dat, what = "BIC", col = "black",
ylab = "-BIC", ylim = c(0, 350))
@
\caption{Plot of BIC values for a variety of models and a range of number of
clusters. \label{CA-mclust1}}
\end{center}
\end{figure}
The BIC selects model VVI (diagonal varying volume and varying shape) with
three clusters as the best solution as can be
seen from the \Rcmd{print} output:
<>=
print(planet_mclust)
@
This solution can be shown graphically as a scatterplot matrix.
The plot is shown in Figure~\ref{CA-planets-mclust-scatter}.
Figure~\ref{CA-planets-mclust-scatterclust} depicts the clustering solution
in the three-dimensional space.
\begin{figure}
\begin{center}
<>=
clPairs(planet.dat,
classification = planet_mclust$classification,
symbols = 1:3, col = "black")
@
\caption{Scatterplot matrix of planets data showing a three-cluster solution
from \Rcmd{Mclust}. \label{CA-planets-mclust-scatter}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
scatterplot3d(log(planets$mass), log(planets$period),
log(planets$eccen + ifelse(planets$eccen == 0,
0.001, 0)),
type = "h", angle = 55, scale.y = 0.7,
pch = planet_mclust$classification,
y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1,
xlab = "log(mass)", ylab = "log(period)",
zlab = "log(eccen)")
@
\caption{3D scatterplot of planets data showing a three-cluster solution
from \Rcmd{Mclust}. \label{CA-planets-mclust-scatterclust}}
\end{center}
\end{figure}
The number of planets in each cluster and the mean vectors of the three clusters
for the untransformed data can now be inspected by using
<>=
table(planet_mclust$classification)
ccent(planet_mclust$classification)
@
Cluster 1 consists of planets about the same size as Jupiter
with very short periods and eccentricities (similar to the first
cluster of the $k$-means solution). Cluster 2 consists of slightly
larger planets with moderate periods and large eccentricities,
and cluster 3 contains the very large planets with very large
periods. These two clusters do not match those found by the $k$-means
approach.
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}