\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}} \newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{\textit{#1}} %%' %%% Math symbols \usepackage{amstext} \usepackage{amsmath} \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\L}{L} \renewcommand{\P}{\mathsf{P}} \newcommand{\K}{\mathbf{K}} \newcommand{\m}{\mathbf{m}} \newcommand{\argmin}{\operatorname{argmin}\displaylimits} \newcommand{\argmax}{\operatorname{argmax}\displaylimits} \newcommand{\bx}{\mathbf{x}} \newcommand{\bbeta}{\mathbf{\beta}} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)}, pdfsubject = {Book}, pdfauthor = {Torsten Hothorn and Brian S. Everitt}, colorlinks = {black}, linkcolor = {black}, citecolor = {black}, urlcolor = {black}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% : conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% \usepackage{longtable} \usepackage[figuresright]{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} \hyphenation{mar-gi-nal} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} %\usepackage{setspace} \definecolor{sidebox_todo}{rgb}{1,1,0.2} \newcommand{\todo}[1]{ \hspace{0pt}% \marginpar{% \fcolorbox{black}{sidebox_todo}{% \parbox{\marginparwidth} { \raggedright\sffamily\footnotesize{TODO: #1}% } }% } } \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition} \author{Torsten Hothorn and Brian S. Everitt} \maketitle %%\VignetteIndexEntry{Chapter Cluster Analysis} %%\VignetteDepends{scatterplot3d,mclust,mvtnorm,lattice} \setcounter{chapter}{20} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, show.signif.stars = FALSE, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)), bigleftpar = function() par(mai = par("mai") * c(1, 1.7, 1, 1)))) HSAURpkg <- require("HSAUR3") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3")) rm(HSAURpkg) ### hm, R-2.4.0 --vanilla seems to need this a <- Sys.setlocale("LC_ALL", "C") ### book <- TRUE refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA", "MDS", "CA"), 1:18) ch <- function(x) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~", ch[2], sep = "")) } } if (file.exists("deparse.R")) source("deparse.R") setHook(packageEvent("lattice", "attach"), function(...) { lattice.options(default.theme = function() standard.theme("pdf", color = FALSE)) }) @ \pagestyle{headings} <>= book <- FALSE @ %% lower png resolution for vignettes \SweaveOpts{resolution = 100} <>= library("mclust") library("mvtnorm") mai <- par("mai") options(SweaveHooks = list(rmai = function() { par(mai = mai * c(1,1,1,2))})) data("pottery", package = "HSAUR3") @ \chapter[Cluster Analysis]{Cluster Analysis: Classifying Romano-British Pottery and Exoplanets \label{CA}} \section{Introduction} \section{Cluster Analysis} \section{Analysis Using \R{}} \subsection{Classifying Romano-British Pottery} We start our analysis with computing the dissimilarity matrix containing the Euclidean distance of the chemical measurements on all $\Sexpr{nrow(pottery)}$ pots. The resulting $\Sexpr{nrow(pottery)} \times \Sexpr{nrow(pottery)}$ matrix can be inspected by an \stress{image plot}, here obtained from \index{Image plot} function \Rcmd{levelplot} available in package \Rpackage{lattice} \citep{PKG:lattice, HSAUR:Sarkar2008}. Such a plot associates each cell of the dissimilarity matrix with a color or a gray value. We choose a very dark grey for cells with distance zero (i.e., the diagonal elements of the dissimilarity matrix) and pale values for cells with greater Euclidean distance. Figure~\ref{CA-pottery-distplot} leads to the impression that there are at least three distinct groups with small inter-cluster differences (the dark rectangles) whereas much larger distances can be observed for all other cells. \begin{figure} \begin{center} <>= pottery_dist <- dist(pottery[, colnames(pottery) != "kiln"]) library("lattice") levelplot(as.matrix(pottery_dist), xlab = "Pot Number", ylab = "Pot Number") @ <>= trellis.par.set(standard.theme(color = FALSE)) plot(levelplot(as.matrix(pottery_dist), xlab = "Pot Number", ylab = "Pot Number")) @ \caption{Image plot of the dissimilarity matrix of the \Robject{pottery} data. \label{CA-pottery-distplot}} \end{center} \end{figure} We now construct three series of partitions using single, complete, and average linkage hierarchical clustering as introduced in Subsections~\ref{CA:HC} and \ref{CA:diss}. The function \Rcmd{hclust} performs all three procedures based on the dissimilarity matrix of the data; its \Rcmd{method} argument is used to specify how the distance between two clusters is assessed. The corresponding \Rcmd{plot} method draws a dendrogram; the code and results are given in Figure~\ref{CA-pottery-hclust}. Again, all three dendrograms lead to the impression that three clusters fit the data best (although this judgement is very informal). \begin{figure} \begin{center} <>= pottery_single <- hclust(pottery_dist, method = "single") pottery_complete <- hclust(pottery_dist, method = "complete") pottery_average <- hclust(pottery_dist, method = "average") layout(matrix(1:3, ncol = 3)) plot(pottery_single, main = "Single Linkage", sub = "", xlab = "") plot(pottery_complete, main = "Complete Linkage", sub = "", xlab = "") plot(pottery_average, main = "Average Linkage", sub = "", xlab = "") @ \caption{Hierarchical clustering of \Robject{pottery} data and resulting dendrograms. \label{CA-pottery-hclust}} \end{center} \end{figure} From the \Robject{pottery\_average} object representing the average linkage hierarchical clustering, we derive the three-cluster solution by cutting the dendrogram at a height of four (which, based on the right display in Figure~\ref{CA-pottery-hclust} leads to a partition of the data into three groups). Our interest is now a comparison with the kiln sites at which the pottery was found. <>= pottery_cluster <- cutree(pottery_average, h = 4) xtabs(~ pottery_cluster + kiln, data = pottery) @ The contingency table shows that cluster 1 contains all pots found at kiln site number one, cluster 2 contains all pots from kiln sites number two and three, and cluster three collects the ten pots from kiln sites four and five. In fact, the five kiln sites are from three different regions defined by one, two and three, and four and five, so the clusters actually correspond to pots from three different regions. \subsection{Classifying Exoplanets} \begin{figure} \begin{center} <>= data("planets", package = "HSAUR3") library("scatterplot3d") scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen + ifelse(planets$eccen == 0, 0.001, 0)), type = "h", angle = 55, pch = 16, y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1, scale.y = 0.7, xlab = "log(mass)", ylab = "log(period)", zlab = "log(eccen)") @ \caption{3D scatterplot of the logarithms of the three variables available for each of the exoplanets. \label{CA-planets-scatter}} \end{center} \end{figure} \begin{figure} \begin{center} <>= rge <- apply(planets, 2, max) - apply(planets, 2, min) planet.dat <- sweep(planets, 2, rge, FUN = "/") n <- nrow(planet.dat) wss <- rep(0, 10) wss[1] <- (n - 1) * sum(apply(planet.dat, 2, var)) for (i in 2:10) wss[i] <- sum(kmeans(planet.dat, centers = i)$withinss) plot(1:10, wss, type = "b", xlab = "Number of groups", ylab = "Within groups sum of squares") @ \caption{Within-cluster sum of squares for different numbers of clusters for the exoplanet data. \label{CA-planets-ss}} \end{center} \end{figure} Sadly Figure~\ref{CA-planets-ss} gives no completely convincing verdict on the number of groups we should consider, but using a little imagination `little elbows' can be spotted at the three and five group solutions. %%' We can find the number of planets in each group using <>= planet_kmeans3 <- kmeans(planet.dat, centers = 3) table(planet_kmeans3$cluster) @ The centers of the clusters for the untransformed data can be computed using a small convenience function <>= ccent <- function(cl) { f <- function(i) colMeans(planets[cl == i,]) x <- sapply(sort(unique(cl)), f) colnames(x) <- sort(unique(cl)) return(x) } @ which, applied to the three-cluster solution obtained by $k$-means gets <>= ccent(planet_kmeans3$cluster) @ @ for the three-cluster solution and, for the five cluster solution using <>= planet_kmeans5 <- kmeans(planet.dat, centers = 5) table(planet_kmeans5$cluster) ccent(planet_kmeans5$cluster) @ \subsection{Model-based Clustering in \R{}} We now proceed to apply model-based clustering to the \Robject{planets} data. \R{} functions for model-based clustering are available in package \Rpackage{mclust} \citep{PKG:mclust,HSAUR:FraleyRaftery2002}. Here we use the \Rcmd{Mclust} function since this selects both the most appropriate model for the data \stress{and} the optimal number of groups based on the values of the BIC computed over several models and a range of values for number of groups. The necessary code is: <>= library("mclust") planet_mclust <- Mclust(planet.dat) @ and we first examine a plot of BIC values using the \R{} code that is displayed on top of Figure~\ref{CA-mclust1}. In this diagram the different plotting symbols refer to different model assumptions about the shape of clusters: \begin{description} \item[EII] spherical, equal volume, \item[VII] spherical, unequal volume, \item[EEI] diagonal, equal volume and shape, \item[VEI] diagonal, varying volume, equal shape, \item[EVI] diagonal, equal volume, varying shape, \item[VVI] diagonal, varying volume and shape, \item[EEE] ellipsoidal, equal volume, shape, and orientation, \item[EEV] ellipsoidal, equal volume and equal shape, \item[VEV] ellipsoidal, equal shape, \item[VVV] ellipsoidal, varying volume, shape, and orientation \end{description} \begin{figure} \begin{center} <>= plot(planet_mclust, planet.dat, what = "BIC", col = "black", ylab = "-BIC", ylim = c(0, 350)) @ \caption{Plot of BIC values for a variety of models and a range of number of clusters. \label{CA-mclust1}} \end{center} \end{figure} The BIC selects model VVI (diagonal varying volume and varying shape) with three clusters as the best solution as can be seen from the \Rcmd{print} output: <>= print(planet_mclust) @ This solution can be shown graphically as a scatterplot matrix. The plot is shown in Figure~\ref{CA-planets-mclust-scatter}. Figure~\ref{CA-planets-mclust-scatterclust} depicts the clustering solution in the three-dimensional space. \begin{figure} \begin{center} <>= clPairs(planet.dat, classification = planet_mclust$classification, symbols = 1:3, col = "black") @ \caption{Scatterplot matrix of planets data showing a three-cluster solution from \Rcmd{Mclust}. \label{CA-planets-mclust-scatter}} \end{center} \end{figure} \begin{figure} \begin{center} <>= scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen + ifelse(planets$eccen == 0, 0.001, 0)), type = "h", angle = 55, scale.y = 0.7, pch = planet_mclust$classification, y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1, xlab = "log(mass)", ylab = "log(period)", zlab = "log(eccen)") @ \caption{3D scatterplot of planets data showing a three-cluster solution from \Rcmd{Mclust}. \label{CA-planets-mclust-scatterclust}} \end{center} \end{figure} The number of planets in each cluster and the mean vectors of the three clusters for the untransformed data can now be inspected by using <>= table(planet_mclust$classification) ccent(planet_mclust$classification) @ Cluster 1 consists of planets about the same size as Jupiter with very short periods and eccentricities (similar to the first cluster of the $k$-means solution). Cluster 2 consists of slightly larger planets with moderate periods and large eccentricities, and cluster 3 contains the very large planets with very large periods. These two clusters do not match those found by the $k$-means approach. \bibliographystyle{LaTeXBibTeX/refstyle} \bibliography{LaTeXBibTeX/HSAUR} \end{document}