% -*- mode: noweb; ess-noweb-default-code-mode: R-mode; -*-
\documentclass[nojss,noheadings]{jss}
\usepackage{amsmath,amssymb,amsfonts,thumbpdf}

%% correct pdf version for EPM
%\pdfminorversion=4

%% additional commands
\newcommand{\squote}[1]{`{#1}'}
\newcommand{\dquote}[1]{``{#1}''}
\newcommand{\fct}[1]{{\texttt{#1()}}}
\newcommand{\class}[1]{\dquote{\texttt{#1}}}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
%% for internal use
\newcommand{\fixme}[1]{\emph{\marginpar{FIXME} (#1)}}
\newcommand{\readme}[1]{\emph{\marginpar{README} (#1)}}
\definecolor{updatecolor}{rgb}{0,0.4,0}
\newcommand{\update}[1]{\marginpar{UPDATE}{\color{updatecolor}#1}}
\newcommand{\revision}[1]{{\color{updatecolor}#1}}

\author{Hannah Frick\\Universit\"at Innsbruck \And
	Carolin Strobl\\Universit\"at Z\"urich \And
	Achim Zeileis\\Universit\"at Innsbruck}
\Plainauthor{Hannah Frick, Carolin Strobl, Achim Zeileis}


\title{Rasch Mixture Models for DIF Detection:\\A Comparison of Old and New Score Specifications}
\Plaintitle{Rasch Mixture Models for DIF Detection: A Comparison of Old and New Score Specifications}
\Shorttitle{Rasch Mixture Models for DIF Detection}

\Abstract{
  This vignette is a (slightly) modified version of 
  \cite{psychomix:Frick+Strobl+Zeileis:2014}, published in 
  \emph{Educational and Psychological Measurement}.

  Rasch mixture models can be a useful tool when checking the assumption of 
  measurement invariance for a single Rasch model. They provide advantages 
  compared to manifest DIF tests when the DIF groups are only weakly correlated 
  with the manifest covariates available. Unlike in single Rasch models, 
  estimation of Rasch mixture models is sensitive to the specification of the 
  ability distribution even when the conditional maximum likelihood approach is 
  used. It is demonstrated in a simulation study how differences in ability can 
  influence the latent classes of a Rasch mixture model. If the aim is only DIF
  detection, it is not of interest to uncover such ability differences as one is
  only interested in a latent group structure regarding the item difficulties. 
  To avoid any confounding effect of ability differences (or impact), a new 
  score distribution for the Rasch mixture model is introduced here. It ensures
  the estimation of the Rasch mixture model to be independent of the ability 
  distribution and thus restricts the mixture to be sensitive to latent 
  structure in the item difficulties only. Its usefulness is demonstrated in a 
  simulation study and its application is illustrated in a study of verbal 
  aggression.
}

\Keywords{mixed Rasch model, Rasch mixture model, DIF detection, score distribution}

\Address{
  Hannah Frick, Achim Zeileis\\
  Department of Statistics\\
  Faculty of Economics and Statistics\\
  Universit\"at Innsbruck\\
  Universit\"atsstr.~15\\
  6020 Innsbruck, Austria\\
  E-mail: \email{Hannah.Frick@uibk.ac.at}, \email{Achim.Zeileis@R-project.org}\\
  URL: \url{http://eeecon.uibk.ac.at/~frick/}, \url{http://eeecon.uibk.ac.at/~zeileis/}\\

  Carolin Strobl\\
  Department of Psychology\\
  Universit\"at Z\"urich\\
  Binzm\"uhlestr.~14\\
  8050 Z\"urich, Switzerland\\
  E-mail: \email{Carolin.Strobl@psychologie.uzh.ch}\\
  URL: \url{http://www.psychologie.uzh.ch/methoden.html}
}

%% Sweave/vignette information and metadata
%% need no \usepackage{Sweave}
\SweaveOpts{engine = R, eps = FALSE, keep.source = TRUE}
%\VignetteIndexEntry{Rasch Mixture Models for DIF Detection: A Comparison of Old and New Score Specifications}
%\VignetteDepends{graphics, stats, methods, psychomix, lmtest}
%\VignetteKeywords{mixed Rasch model, Rasch mixture model, DIF detection, score distribution}
%\VignettePackage{psychomix}

<<preliminaries, echo=FALSE, results=hide>>=
options(width = 70, prompt = "R> ", continue = "+  ")
library("psychomix")
library("lattice")
suppressWarnings(RNGversion("3.5.0"))
set.seed(1090)
cache <- FALSE
@


%% publication info and copyright notice
\usepackage{fancyhdr}
\setlength{\headheight}{15pt}
\renewcommand{\headrulewidth}{0pt}
\pagestyle{fancy}
\thispagestyle{plain}
\fancyhf{}
\fancyhead[RO,LE]{\thepage}
\fancyhead[CO]{\textit{\normalsize Hannah Frick, Carolin Strobl, Achim Zeileis}}
\fancyhead[CE]{\textit{\normalsize Rasch Mixture Models for DIF Detection}}
\fancyfoot[LO,LE]{\small Copyright {\copyright} 2015 The Author(s)}
\fancypagestyle{plain}{
\fancyhf{}
\fancyfoot[LO,LE]{\small This is a preprint of an article published in \textit{Educational and Psychological Measurement}, \textbf{75}(2), 208--234.
  \doi{10.1177/0013164414536183} \\
  Copyright~{\copyright} 2015 The Author(s). \url{http://epm.sagepub.com/}.
  }
}

\begin{document}

\section{Introduction} \label{sec:intro}

Based on the Rasch model \citep{psychomix:Rasch:1960}, 
\cite{psychomix:Rost:1990} introduced what he called the \dquote{mixed Rasch 
  model}, a combination of a latent class approach and a latent trait approach 
to model qualitative and quantitative ability differences. As suggested by 
\cite{psychomix:Rost:1990}, it can also be used to examine the fit of the Rasch 
model and check for violations of measurement invariance such as differential 
item functioning (DIF). Since the model assumes latent classes for which 
separate Rasch models hold, it can be employed to validate a psychological test 
or questionnaire: if a model with two or more latent classes fits better than a 
model with one latent class, measurement invariance is violated and a single 
Rasch model is not suitable because several latent classes are present in the 
data which require separate Rasch models with separate sets of item 
difficulties. These classes are latent in the sense that they are not determined
by covariates.

As the model assesses a questionnaire -- or instrument as it will be referred to
in the following -- as a whole, it works similar to a global test like the 
likelihood ratio (LR) test 
\citep{psychomix:Andersen:1972, psychomix:Gustafsson:1980}, not an itemwise test
like the Mantel-Haenszel test \citep{psychomix:Holland+Thayer:1988}. Hence, it
is the set of item parameters for all items, which is tested for differences 
between groups rather than each item parameter being tested separately.

The mixed Rasch model -- here called Rasch mixture  model to avoid confusion
with mixed (effects) models and instead highlight its relation to mixture 
models -- has since been extended by \cite{psychomix:Rost+VonDavier:1995} to 
different score distributions and by \cite{psychomix:Rost:1991} and 
\cite{psychomix:vonDavier+Rost:1995} to polytomous responses. The so-called 
\dquote{mixed ordinal Rasch model} is a mixture of partial credit 
models~\citep[PCM,][]{psychomix:Masters:1982} and includes a mixture of rating
scale models~\citep[RSM,][]{psychomix:Andrich:1978} as a special case.

The original dichotomous model as well as its polytomous version have been 
applied in a variety of fields. \cite{psychomix:Zickar+Gibby+Robie:2004} use a 
mixture PCM to detect faking in personality questionnaires, while 
\cite{psychomix:Hong+Min:2007} identify three types/classes of depressed 
behavior by applying a mixture RSM to a self-rating depression scale. Another 
vast field of application are tests in educational measurement. 
\cite{psychomix:Baghaei+Carstensen:2013} identify different reader types from a 
reading comprehension test using a Rasch mixture model. 
\cite{psychomix:Maij-deMeij+Kelderman+vanderFlier:2010} also apply a Rasch 
mixture model to identify latent groups in a vocabulary test. 
\cite{psychomix:Cohen+Bolt:2005} use a Rasch mixture model to detect DIF in a 
mathematics placement test.

Rasch mixture models constitute a legitimate alternative to DIF tests for 
manifest variables such as the LR test or the recently proposed Rasch trees 
\citep{psychomix:Strobl+Kopf+Zeileis:2013}. These methods are usually used to 
test DIF based on observed covariates, whereas 
\cite{psychomix:Maij-deMeij+Kelderman+vanderFlier:2010} show that mixture
models are more suitable to detect DIF if the \dquote{true source of bias} is 
a latent grouping variable. The simulation study by 
\cite{psychomix:Preinerstorfer+Formann:2011} suggests that parameter recovery
works reasonably well for Rasch mixture models. While they did not study in 
detail the influence of DIF effect size or the effect of different ability 
distributions, they deem such differences relevant for practical concern but 
leave it to further research to establish just how strongly they influence 
estimation accuracy.

As the Rasch model is based on two aspects, subject ability and item difficulty,
Rasch mixture models are sensitive not only to differences in the item 
difficulties -- as in DIF -- but also to differences in abilities. Such 
differences in abilities are usually called impact and do not infringe on 
measurement invariance \citep{psychomix:Ackerman:1992}. In practice, when 
developing a psychological test, one often follows two main steps. First, the 
item parameters are estimated, e.g., by means of the conditional maximum 
likelihood (CML) approach, checked for model violations and problematic items 
are possibly excluded or modified. Second, the final set of items is used to 
estimate person abilities. The main advantage of the CML approach is that, for a
single Rasch model, the estimation and check of item difficulties are 
(conditionally) independent of the abilities and their distribution.
Other global assessment methods like the LR test and the Rasch trees are also 
based on the CML approach to achieve such independence. However, in a Rasch
mixture model, the estimation of the item difficulties is not independent of 
the ability distribution, even when employing the CML approach. 
\cite{psychomix:DeMars+Lau:2011} find that a difference in mean ability between
DIF groups affects the estimation of the DIF effect sizes. Similarly, other DIF
detection methods are also affected by impact, e.g., inflated type I error rates
occur in the Mantel-Haenszel and logistic regression procedures if impact is
present \citep{psychomix:Li+Brooks+Johanson:2012, psychomix:DeMars:2010}.

When using a Rasch mixture model for DIF detection, an influence of impact alone
on the mixture is undesirable as the goal is to uncover DIF groups based on 
item difficulties, not impact groups based on abilities. To avoid such 
confounding effects of impact, we propose a new version of the Rasch mixture 
model specifically designed to detect DIF, which allows for the transfer of the
crucial property of CML from a single Rasch model to the mixture: estimation and
testing of item difficulties is independent of the abilities and their 
distribution.

A simulation study is conducted to illustrate how previously suggested versions
and this new version of the Rasch mixture model react to impact, either alone or
in combination with DIF, and how this affects the suitability of the Rasch
mixture model as a DIF detection method.

In the following, we briefly discuss the Rasch model and Rasch mixture models to
explain why the latter are sensitive to the specification of the score 
distribution despite employing a conditional maximum likelihood approach for 
estimation. This Section~\ref{sec:theory} is concluded with our suggested new 
score distribution. We illustrate and discuss the behavior of Rasch mixture 
models with different options for the score distribution in a Monte Carlo study 
in Section~\ref{sec:simulation}. The suggested approach for DIF detection via 
Rasch mixture models is illustrated through an empirical application to a study 
on verbally aggressive behavior in Section~\ref{sec:application}. Concluding 
remarks are provided in Section~\ref{sec:conclusion}.


%\pagebreak

\section{Theory} \label{sec:theory}


\subsection{The Rasch model} \label{sec:Rasch}

The Rasch model, introduced by Georg \cite{psychomix:Rasch:1960}, models the
probability for a binary response~$y_{ij} \in \{0, 1\}$ by subject~$i$ to
item~$j$ as dependent on the subject's ability~$\theta_i$ and the item's
difficulty~$\beta_j$. Assuming independence between items given the subject, the
probability for observing a vector $y_i = (y_{i1}, \dots, y_{im})^\top$ with
responses to all $m$~items by subject~$i$ can be written as
%
\begin{equation}
  P(Y_{i} = y_{i} | \theta_i, \beta) ~=~ \prod_{j=1}^{m}
  \frac{\exp \{y_{ij}(\theta_i - \beta_j)\}}{1 + \exp \{\theta_i - \beta_j\}},
  \label{eq:RM:base}
\end{equation}
%
depending on the subject's ability $\theta_i$ and the vector of all item 
difficulties $\beta = (\beta_1, \dots, \beta_m)^\top$.
Capital letters denote random variables and lower case letters denote their
realizations.

Since joint maximum likelihood (JML) estimation of all abilities and 
difficulties is not consistent for a fixed number of items $m$ 
\citep{psychomix:Molenaar:1995a}, conditional maximum likelihood (CML) 
estimation is employed here. This exploits that the number of correctly scored 
items, the so-called raw score $R_i = \sum_{j = 1}^{m}{Y_{ij}}$, is a sufficient 
statistic for the ability $\theta_i$ \citep{psychomix:Molenaar:1995a}.
Therefore, the answer probability from Equation~\ref{eq:RM:base} can be split
into two factors where the first factor is conditionally independent of 
$\theta_i$:
%
\begin{eqnarray*}
  \label{eq:RM:fac}
  P(Y_{i} = y_{i} | \theta_i, \beta)
    & = & P(Y_i = y_i | r_i, \theta_i, \beta) ~ P(R_i = r_i | \theta_i, \beta) \\
  \label{eq:RM:hg}
    & = & \underbrace{P(Y_i = y_i | r_i, \beta)}_{h(y_i | r_i, \beta)} ~
          \underbrace{P(R_i = r_i | \theta_i, \beta)}_{g(r_i | \theta_i, \beta)}.
\end{eqnarray*} 
%
Due to this separation, consistent estimates of the item parameters $\beta$ can
be obtained by maximizing only the conditional part of the likelihood 
$h(\cdot)$:
% 
\begin{equation}
  h(y_i| r_i, \beta) ~=~ \frac
    {\exp\{- \sum_{j = 1}^{m}{y_{ij} \beta_j}\}}
    {\gamma_{r_i}(\beta)}, \label{eq:RM:h}
\end{equation}
%
with $\gamma_{j}(\cdot)$ denoting the elementary symmetric function of 
order~$j$. The resulting CML estimates $\hat \beta$ are consistent, 
asymptotically normal, and asymptotically efficient 
\citep{psychomix:Molenaar:1995a}.

If not only the conditional likelihood but the full likelihood is of interest
-- as in Rasch mixture models -- then the score distribution $g(\cdot)$ needs
to be specified as well. The approach used by \cite{psychomix:Rost:1990} and
\cite{psychomix:Rost+VonDavier:1995} is to employ some distribution for the
raw scores $r_i$ based on a set of auxiliary parameters $\delta$. Then the
probability density function for $y_i$ can be written as:
%
\begin{equation} \label{eq:RM:f}
f(y_i | \beta, \delta) ~=~ h(y_i | r_i, \beta) ~ g(r_i | \delta).
\end{equation}
%
Based on this density, the following subsections first introduce mixture Rasch 
models in general and then discuss several choices for $g(\cdot)$. CML 
estimation is used throughout for estimating the Rasch model, i.e., the 
conditional likelihood $h(\cdot)$ is always specified by Equation~\ref{eq:RM:h}.


%\pagebreak

\subsection{Rasch mixture models}\label{sec:RMM}

Mixture models are essentially a weighted sum over several components,
i.e., here over several Rasch models. Using the Rasch model density function 
from Equation~\ref{eq:RM:f}, the likelihood $L(\cdot)$ of a Rasch mixture model 
with $K$~components for data from $n$~respondents is given by
%
\begin{eqnarray}
  L(\pi^{(1)}, \dots, \pi^{(K)}, \beta^{(1)}, \dots, \beta^{(K)}, \delta^{(1)}, \dots, \delta^{(K)}) 
  &=& \prod_{i=1}^{n}{\sum_{k=1}^{K}{\pi^{(k)} f(y_i | \beta^{(k)}, \delta^{(k)})}}  \label{eq:RMM:f} \nonumber\\
  &=& \prod_{i=1}^{n}{\sum_{k=1}^{K}{\pi^{(k)} h(y_i | r_i, \beta^{(k)}) ~ g(r_i | \delta^{(k)})}}. \label{eq:RMM:hg} 
\end{eqnarray}
%
where the $(k)$-superscript denotes the component-specific parameters: the 
component weight $\pi^{(k)}$, the component-specific item parameters 
$\beta^{(k)}$, and the component-specific score parameters $\delta^{(k)}$ for 
$k = 1, \dots, K$.

This kind of likelihood can be maximized via the expectation-maximization (EM)
algorithm \citep{psychomix:Dempster+Laird+Rubin:1977} which alternates between
maximizing the component-specific likelihoods for obtaining parameter estimates
and computing expectations for each observations belonging to each cluster.

More formally, given (initial) estimates for the model parameters
$\hat{\pi}^{(k)}$, $\hat{\beta}^{(k)}$, $\hat{\delta}^{(k)}$ for all components
$k = 1, \dots, K$, posterior probabilities of each observation~$i$ belonging to
a component, or latent class,~$k$ are calculated in the E-step. This is simply 
$i$'s relative contribution to component~$k$ compared to the sum of all its 
contributions:
% 
\begin{equation}
  \hat{p}_{ik}
    ~=~ \frac{\hat{\pi}^{(k)} ~ f(y_i | \hat{\beta}^{(k)}, \hat{\delta}^{(k)})}%
       {\sum_{\ell=1}^{K} \hat{\pi}^{(\ell)} ~ f(y_i | \hat{\beta}^{(\ell)}, \hat{\delta}^{(\ell)})}
    ~=~ \frac{\hat{\pi}^{(k)} ~ h(y_i | r_i, \hat{\beta}^{(k)}) ~ g(r_i | \hat{\delta}^{(k)})}%
       {\sum_{\ell=1}^{K} \hat{\pi}^{(\ell)} ~ h(y_i | r_i, \hat{\beta}^{(\ell)}) ~ g(r_i | \hat{\delta}^{(\ell)})} \; . \label{eq:RMM:Estep:hg}
\end{equation}
% 
In the M-step of the algorithm, these posterior probabilities are used as the
weights in a weighted ML estimation of the model parameters. This way, an
observation deemed unlikely to belong to a certain latent class does not 
contribute strongly to its estimation. Estimation can be done separately for 
each latent class. Using CML estimation for the Rasch Model, the estimation of
item and score parameters can again be done separately. For all components 
$k = 1, \dots, K$:
% 
\begin{eqnarray}
  (\hat{\beta}^{(k)}, \hat{\delta}^{(k)}) 
  & = & \argmax_{\beta^{(k)}, \delta^{(k)}} \sum_{i=1}^{n}{\hat{p}_{ik}
    \log f(y_i | \beta^{(k)}, \delta^{(k)})}
  \label{eq:RMM:Mstep} \nonumber\\
  & = & \left\{
    \argmax_{\beta^{(k)}} \sum_{i=1}^{n}{\hat{p}_{ik} \log h(y_i | r_i, \beta^{(k)})};~
    \argmax_{\delta^{(k)}} \sum_{i=1}^{n}{\hat{p}_{ik} \log g(r_i | \delta^{(k)})}
  \right\}. \label{eq:RMM:Mstep:sep}
\end{eqnarray}
%
Estimates of the class probabilities can be obtained from the posterior 
probabilities by averaging:
%
\begin{equation}
  \hat{\pi}^{(k)} = \frac{1}{n} \sum_{i=1}^{n}{\hat{p}_{ik}}. \label{eq:RMM:Mstep:pi}  
\end{equation}
%
The E-step (Equation~\ref{eq:RMM:Estep:hg}) and M-step 
(Equations~\ref{eq:RMM:Mstep:sep} and~\ref{eq:RMM:Mstep:pi}) are iterated until 
convergence, always updating either the weights based on current estimates for 
the model parameters or vice versa.

Note that the above implicitly assumes that the number of latent classes~$K$ is
given or known. However, this is typically not the case in practice and $K$ 
needs to be chosen based on the data. As $K$ is not a model parameter -- 
regularity conditions for the likelihood ratio test are not fulfilled
\citep[][Chapter~6.4]{psychomix:McLachlan+Peel:2000} -- it is often chosen via 
some information criterion that balances goodness of fit
(via the likelihood) with a penalty for the number of model parameters. 
Since the various information criteria differ in their penalty term, the
decision which model is considered ``best'' may depend on the information 
criterion chosen. In the following, the BIC 
\citep[Bayesian information criterion,][]{psychomix:Schwarz:1978} is used, which
\cite{psychomix:Li+Cohen+Kim:2009} found to be a suitable model selection method
for dichotomous mixture item response theory models. Note that this is not a 
formal significance test because one does not control a type I error rate.

\subsection{Score distribution} \label{sec:scores}

In a single Rasch model, the estimation of the item parameters is invariant to
the score distribution because of the separation in Equation~\ref{eq:RM:f}.
In the mixture context, this invariance property holds only \emph{given the 
  weights} in Equation~\ref{eq:RMM:Mstep:sep}. However, these posterior weights
depend on the full Rasch likelihood, including the score distribution 
(Equation~\ref{eq:RMM:Estep:hg}). Therefore, the estimation of the item 
parameters in a Rasch mixture model is \emph{not} independent of the score 
distribution for $K > 1$, even if the CML approach is employed. Hence, it is 
important to consider the specification of the score distribution when 
estimating Rasch mixture models and to assess the consequences of potential 
misspecifications.

\subsubsection{Saturated and mean-variance specification}

In his introduction of the Rasch mixture model, \cite{psychomix:Rost:1990}
suggests a discrete probability distribution on the scores with a separate
parameter for each possible score. This requires $m - 2$ parameters per latent 
class as the probabilities need to sum to $1$ (and the extreme scores, $r = 0$ 
and $r = m$, do not contribute to the likelihood).

Realizing that this saturated specification requires a potentially rather large
number of parameters, \cite{psychomix:Rost+VonDavier:1995} suggest a parametric
distribution with one parameter each for mean and variance.

Details on both specifications can be found in \cite{psychomix:Rost:1990} and 
\cite{psychomix:Rost+VonDavier:1995}, respectively. Here, the notation of 
\cite{psychomix:Frick+Strobl+Leisch:2012} is adopted, which expresses both 
specifications in a unified way through a conditional logit model for the 
score $r = 1, \dots, m-1$:
% 
\begin{equation*}
  \label{eq:scores:condLogit}
  g(r | \delta^{(k)}) ~=~
      \frac{\exp\{ z_{r}^\top \delta^{(k)} \}}
           {\sum_{j=1}^{m-1}{\exp\{z_{j}^\top \delta^{(k)} \}}},
\end{equation*}
%
with different choices for $z_r$ leading to the saturated and mean-variance
specification, respectively. For the former, the regressor vector is 
$(m-2)$-dimensional with
%
\begin{equation*}
  \label{eq:scores:saturated}
  z_{r} = (0, \ldots, 0, 1, 0, \ldots, 0)^\top
\end{equation*}
%
and the 1 at position $r - 1$. Consequently, if $r = 1$, $z_{r}$ is a vector
of zeros. For the mean-variance specification, the regressor vector is
$2$-dimensional and given by
%
\begin{equation*}
  \label{eq:scores:meanvar}
  z_{r} = \left( \frac{r}{m}, \frac{4 r (m - r)}{m^2} \right)^\top.
\end{equation*}

\subsubsection{Restricted specification}

In the following we suggest a new specification of the score distribution in 
the Rasch mixture model, which aims at obtaining independence of the item 
parameter estimates from the specification of the score distribution 
and therefore enabling the Rasch mixture model to distinguish between DIF and 
impact. Other global DIF detection methods like the LR test and Rasch trees are
able to make this distinction 
\citep{psychomix:Ankenmann+Witt+Dunbar:1999, psychomix:Strobl+Kopf+Zeileis:2013}
because they are based only on the conditional part of the likelihood
(Equation~\ref{eq:RM:h}). Analogously, we suggest a mixture of only this 
conditional part rather than the full likelihood (Equation~\ref{eq:RM:f}) of the
Rasch model so that the mixture model will only be influenced by differences in
the item parameters.

Mixing only the conditional likelihood $h(\cdot)$ means that the sum over the 
$K$ latent classes in the likelihood of the Rasch mixture model in 
Equation~\ref{eq:RMM:hg} only applies to $h(\cdot)$ but not to the score 
distribution $g(\cdot)$. The mixture is then only based on latent structure in
the item difficulties, not on latent structure in both difficulties and scores. 
Moreover, such a Rasch mixture model based only on the conditional likelihood 
without any score distribution is equivalent to a Rasch mixture model where the
score distribution is independent of the latent class $k = 1, \ldots, K$:

%
\begin{equation*}
  \label{eq:scores:restricted}
  g(r | \delta^{(k)}) = g(r | \delta) \qquad (k = 1, \dots, K),
\end{equation*}
%
because then the factor $g(r|\delta)$ is a constant that can be moved out of 
the sum over the components $k$ in Equation~\ref{eq:RMM:hg}. Consequently, 
compared to the case without any score distribution, the log-likelihood just
changes by an additional constant without component-specific parameters. In
either case, the estimation of the component-specific parameters item parameters
as well as the selection of the number of components $K$ is independent of the
specification of the score distribution.

This equivalence and independence from the score distribution can also be seen 
easily from the definition of the posterior weights 
(Equation~\ref{eq:RMM:Estep:hg}): If restricted, $g(\cdot)$ can be moved out of
the sum and then cancels out, preserving only the dependence on $h(\cdot)$. 
Thus, the $\hat p_{ik}$ depend only on $\hat \pi^{(k)}$ and $\hat \beta^{(k)}$ but 
not $\hat \delta^{(k)}$. Therefore, the component weights and component-specific 
item parameters can be estimated without any specification of the score 
distribution.

Subsequently, we adopt the restricted perspective rather than omitting 
$g(\cdot)$ completely, when we want to obtain a mixture model where the mixture 
is independent of the score distribution. From a statistical point of view this 
facilitates comparisons of the restricted Rasch mixture model with the 
corresponding unrestricted counterpart.


\subsubsection{Overview}

The different specifications of the score distribution vary in their properties
and implications for the whole Rasch mixture model.
\begin{itemize}
  \item The saturated model is very flexible. It can model any shape and is thus
    never misspecified. However, it needs a potentially large number of
    parameters which can be challenging in model estimation and selection.

  \item The mean-variance specification of the score model is more parsimonious
    as it only requires two parameters per latent class. While this is
    convenient for model fit and selection, it also comes at a cost: since it
    can only model unimodal or U-shaped distributions 
    \citep[see][]{psychomix:Rost+VonDavier:1995}, it is partially misspecified 
    if the score distribution is actually multimodal.

  \item A restricted score model is even more parsimonious. Therefore, the same
    advantages in model fit and selection apply. Furthermore, it is invariant to
    the latent structure in the score distribution. If a Rasch mixture model is 
    used for DIF detection, this is favorable as only differences in the item
    difficulties influence the mixture. However, it is partially misspecified if
    the latent structure in the scores and item difficulties coincides.
\end{itemize}


\section{Monte Carlo study} \label{sec:simulation}

The simple question \emph{\dquote{DIF or no DIF?}} leads to the question 
whether the Rasch mixture model is suitable as a tool to detect such violations 
of measurement invariance. 

As the score distribution influences the estimation 
of the Rasch mixture model in general, it is of particular interest how it 
influences the estimation of the number of latent classes, the measure used to 
determine Rasch scalability.

\subsection{Motivational example}

As a motivation for the simulation design, consider the following example: The 
instrument is a knowledge test which is administered to students from two
different types of schools and who have been prepared by one of two different 
courses for the knowledge test. Either of the two groupings might be the source
of DIF (or impact). If the groupings are available as covariates to the item
responses of the students, then a test for DIF between either school types or
course types can be easily carried out using the LR test. However, if the
groupings are not available (or even observed) as covariates, then a DIF test is
still possible by means of the Rasch mixture model. The performance of such a 
DIF assessment is investigated in our simulation study for different effects of
school and course type, respectively.

In the following we assume that the school type is linked to ability difference 
(i.e., impact but not DIF) while the course type is the source of DIF (but not
impact). This can be motivated in the following way (see also 
Figure~\ref{fig:simD:motiv}): When the students from the two school types 
differ in their mean ability, this is impact between these two groups. The 
courses might be a standard course and a new specialized course. While the 
standard course covers all topics of the test equally, the specialized course 
gives more emphasis to a relatively advanced topic and due to time constraints 
less emphasis to a relatively basic topic. This may lead to DIF between the 
students in the standard and the specialized course. See the left panel of 
Figure~\ref{fig:simD:DIFhomo} for illustrative item profiles of the standard 
course (in dark gray) and the specialized course (in light gray).

Finally, the ability groups by school and the DIF groups by course can either 
coincide or not. If all students in the first school type are being taught the 
standard course while all students in the second school type are being taught 
the specialized course, the DIF groups \emph{coincide} with the ability groups.
The DIF and ability groups do \emph{not coincide} but only overlap partly if
both course types are taught in both school types: each DIF group (based on the
type of course taught) consists of a mix of students from both schools and
therefore from both ability groups. An illustration of coinciding and not
coinciding ability and DIF groups is provided in the upper and lower row of
Figure~\ref{fig:simD:motiv}, respectively. Ability groups, based on school type,
are shown in the columns, while DIF groups, based on course type, are 
illustrated with dark and light gray for the standard course and specialized 
course, respectively. This difference of coinciding or not coinciding DIF and 
ability groups might have an influence on the Rasch mixture model's ability to
detect the DIF because in the former case the score distributions differ between
the two DIF groups while in the latter case they do not.

Subsequently, a Monte Carlo study is carried out to investigate how the Rasch 
mixture model performs in situations where such groupings are present in the
underlying data-generating process but are not available as observed covariates.
Moreover, we vary whether or not all students come from the same school type
(i.e., from the same ability distribution), whether or not all students receive
the standard course (i.e., whether there is DIF), and whether both school types
use the same or different courses (i.e., whether the groupings coincide or not).
For all computations, the \proglang{R} system for statistical computing 
\citep{psychomix:R:3.0.2} is used along with the add-on packages \pkg{psychomix}
\citep{psychomix:Frick+Strobl+Leisch:2012} and \pkg{clv} 
\citep{psychomix:Nieweglowski:2013}.


\setkeys{Gin}{width = 0.8\textwidth}
\begin{figure}[t!]
  \centering
<<fig-simD-motiv, echo = FALSE, fig = TRUE, width = 7, height = 2>>=
mygrays <- gray.colors(2)
par(mar = c(0.1, 6, 3, 0.1), las = 1)
plot(0, 0, xlim = c(-0.2, 3), ylim = c(0.2, 1.8), type = "n", axes = FALSE, xlab = "", ylab = "")
points(rep(c(0 + 0:3/5.5, 1 + 0:3/5.5), 2), rep(c(1.5, 0.5), each = 8), pch = 21,
  bg = mygrays[c(rep(1, 4), rep(2, 4), 1, 2, 1, 2, 2, 2, 1, 1)], cex = 2)
axis(2, at = c(1.5, 0.5), c("Coinciding", "Not coinciding"), lwd = 0, pos = -0.2, line = 0)
axis(3, at = c(0, 1), c("School type I\n(low ability)", "School type II\n(high ability)"),
  lwd = 0, hadj = 0)
legend(2, 1.7, c("standard", "specialized"), title = "Course type\n(source of DIF)",
  pch = 21, pt.bg = mygrays, bty = "n", title.adj = 0)
@   
  \caption{Grouping structure in the motivational example.}
  \label{fig:simD:motiv}
\end{figure}

\subsection{Simulation design} \label{sec:design}

<<simPrep, echo = FALSE>>=
## function to generate a design-list for simRaschmix()
generateDesign <- function(nobs = 500, m = 20, weights = NULL,
                           ab = 0, ab.dist = c("fix", "normal"),
                           dif = 2, beta = 1.9, index = 5, coincide = TRUE){
  ## weights
  if (is.null(weights)) weights <- rep(0.25, 4)
  
  ## coincide
  ## can only be set to FALSE if there are differences in both abilities and items
  ## = if either is 0, it has to be TRUE
  if (any(isTRUE(all.equal(c(ab, dif), 0)))) coincide <- TRUE
  
  ## ability
  ab.dist <- match.arg(ab.dist, c("fix", "normal"))
  ab <- c(-ab, ab)
  ab <- if (coincide) rep(ab, each = 2) else rep(ab, times = 2)
  
  ability <- if(ab.dist == "fix"){
    array(c(rbind(ab, 1)), dim = c(1,2,4)) # 1 = rel.frequency in sample()
  } else {
    rbind(ab, 0.3) ## 0.3 = sd for rnorm()
  }

  ## difficulty
  beta <- beta2 <- seq(from = -beta, to = beta, length = m)
  for (i in index){
    beta2[i] <- beta2[i] + dif
    beta2[m-i+1] <- beta2[m-i+1] - dif
  }
  difficulty <- cbind(beta, beta, beta2, beta2)
  
  return(list(nobs = nobs, weights = weights, ability = ability,
              difficulty = difficulty))
  ## NOTE: simRaschmix will return a cluster attribute with 4 different values/classes.
}

stacked_bars <- function(rs, cl = NULL, max = NULL, col = NULL, ...)
{
   if(is.null(max)) max <- max(rs)
   rs <- factor(rs, levels = 0:max)
   if(is.null(cl)) {
     tab <- table(rs)
     names(tab)[diff(-1:max %/% 5) < 1] <- ""
     if(is.null(col)) col <- gray.colors(2)[2]
   } else {
     tab <- table(cl, rs)
     colnames(tab)[diff(-1:max %/% 5) < 1] <- ""
     if(is.null(col)) col <- gray.colors(nrow(tab))
   }
   tab <- prop.table(tab)
   bp <- barplot(tab, space = 0, ...)
}

## colors
mygrays <- gray.colors(2)
myhcl <- psychomix:::qualitative_hcl(3)

## load simulated data
load("scoresim.rda")
scoresim$prop23 <- 1 - scoresim$prop1
@ 

The simulation design combines ideas from the motivational example with aspects
from the simulation study conducted by \cite{psychomix:Rost:1990}. Similar to 
the original simulation study, the item parameters represent an instrument with
increasingly difficult items. Here, 20~items are employed with corresponding 
item parameters $\beta^\mathit{I}$ which follow a sequence from $-1.9$ to 1.9 
with increments of 0.2 and hence sum to zero. 
%
\begin{eqnarray*}
  \beta^\mathit{I} &=& (-1.9, -1.7, \ldots, 1.7, 1.9)^{\top}  \label{eq:simD:betaI}\\
  \beta^\mathit{II} &=& (-1.9, -1.7, \ldots, -1.1 + \Delta, \ldots, 1.1 - \Delta, \ldots, 1.7, 1.9)^{\top}  \label{eq:simD:betaII}
\end{eqnarray*}
%
To introduce DIF, a second set of item parameters $\beta^\mathit{II}$ is 
considered where items 5 and 16 are changed by $\pm \Delta$. This approach is 
similar in spirit to that of \cite{psychomix:Rost:1990} -- who reverses the full
sequence of item parameters to generate DIF -- but allows for gradually changing
from small to large DIF effect sizes. 
Subject abilities are drawn with equal weights from two normal distributions 
with means $-\Theta/2$ and $+\Theta/2$ and standard deviation 0.3, thus creating
a sample with two groups of subjects: one group with a lower mean ability and
one with a higher mean ability. 

In the simulations below, the DIF effect size $\Delta$ ranges from 0 to 4 in 
steps of 0.2 
%
\begin{equation*}
  \label{eq:simD:Delta}
  \Delta ~\in~ \{0, 0.2, \ldots, 4\}
\end{equation*}
%
while the impact $\Theta$ covers the same range in steps of 0.4:
%
\begin{equation*}
  \label{eq:simD:Theta}
  \Theta ~\in~ \{0, 0.4, \ldots, 4\}.
\end{equation*}
%

%
\begin{table}[t!]
  \small
  \centering
  \begin{tabular}{rlcccc}
    \hline\noalign{\smallskip}
    \multicolumn{2}{l}{Scenario} & \multicolumn{2}{c}{Latent class~I} &  \multicolumn{2}{c}{Latent class~II} \\
    & & Mean abilities & Difficulties & Mean abilities & Difficulties \\
    \noalign{\smallskip}\hline\noalign{\smallskip}
    \multicolumn{6}{l}{\em No impact $(\Theta = 0)$}\\ \noalign{\smallskip}
    1 & no DIF $(\Delta = 0)$ & $\{ 0 \}$ & $\beta^\mathit{I}$ & --- & --- \\
    2 & DIF $(\Delta > 0)$ & $\{ 0 \}$ & $\beta^\mathit{I}$ & $\{ 0 \}$ & $\beta^\mathit{II}$ \\
    \noalign{\smallskip}\hline\noalign{\smallskip}
    \multicolumn{6}{l}{\em Impact $(\Theta > 0)$}\\ \noalign{\smallskip}
    3 & no DIF $(\Delta = 0)$ & $\{ -\Theta/2, +\Theta/2 \}$ & $\beta^\mathit{I}$ & --- & --- \\
    4 & DIF $(\Delta > 0)$, not coinciding & $\{ -\Theta/2, +\Theta/2 \}$ & $\beta^\mathit{I}$ & $\{ -\Theta/2, +\Theta/2 \}$ & $\beta^\mathit{II}$ \\
    5 & DIF $(\Delta > 0)$, coinciding & $\{ -\Theta/2 \}$ & $\beta^\mathit{I}$ & $\{ +\Theta/2 \}$ & $\beta^\mathit{II}$ \\
   \noalign{\smallskip}\hline
  \end{tabular}
  \caption{Simulation design. The latent-class-specific item parameters $\beta^\mathit{I}$ and 
    $\beta^\mathit{II}$ differ by $\Delta$ for two elements and thus coincide for $\Delta = 0$, 
    leaving only a single latent class.
  \label{tab:simD:design}}
\end{table}
%

\setkeys{Gin}{width = \textwidth}
\begin{figure}[p!]
\centering
%
<<fig-simD-DIFhomo, echo = FALSE, fig = TRUE, width = 8, height = 2.9>>=
## frame
par(mfrow = c(1,2))
par(mar = c(2, 4, 2, 2) + 0.1) # c(bottom, left, top, right)

## only DIF
des <- generateDesign(ab = 0, dif = 2, ab.dist = "normal")
set.seed(1)
dat <- simRaschmix(des)
rs <- rowSums(dat)
cl <- factor(as.numeric(attr(dat, "cluster") > 2) + 1)
ip <- attr(dat, "difficulty")[,2:3]

plot(ip[,1], type = "n", ylab = "Item difficulty", xlab = "")
points(ip[,2], type = "o", pch = 21, col = 1, bg = mygrays[2], lty = 2)
points(ip[,1], pch = 20, col = mygrays[1])

stacked_bars(rs, cl, max = 20, ylab = "Score frequencies")
@ 
%
\caption{Scenario 2. Left: Item difficulties with DIF ($\Delta = 2$). Right: Stacked histogram of unimodal score distribution with homogeneous abilities ($\Theta = 0$).
\label{fig:simD:DIFhomo}}
\end{figure}


\begin{figure}[p!]
\centering
% 
<<fig-simD-noDIFhet, echo = FALSE, fig = TRUE, width = 8, height = 2.9>>=
## frame
par(mfrow = c(1, 2))
par(mar = c(2, 4, 2, 2) + 0.1) 

## only heterogeneous abilities
des <- generateDesign(ab = 1, dif = 0, ab.dist = "normal") ## ab = 1 --> impact = 2
set.seed(1)
dat <- simRaschmix(des)
rs <- rowSums(dat)
cl <- factor(as.numeric(attr(dat, "cluster") > 2) + 1)
ip <- attr(dat, "difficulty")[,2:3]

plot(ip[,1], type = "b", pch = 21, bg = mygrays[2], lty = 2,  ylab = "Item difficulty", xlab = "")

stacked_bars(rs, cl = NULL, max = 20, ylab = "Score frequencies", xlab = "")

par(mfrow = c(1, 1))
@ 
%
\caption{Scenario 3. Left: Item difficulties without DIF ($\Delta = 0$). Right: Histogram of bimodal score distribution with impact ($\Theta = 2$).
\label{fig:simD:noDIFhet}}
\end{figure}


\begin{figure}[p!]
\centering
% 
<<fig-simD-DIFhet, echo = FALSE, fig = TRUE, width = 8, height = 2.9>>=
## frame
par(mfrow = c(1,2))
par(mar = c(2, 4, 2, 2) + 0.1)

## designs:
## heterogeneous abilities within DIF groups
des <- generateDesign(ab = 1.0, dif = 2, coincide = FALSE, ab.dist = "normal") ## ab = 1 --> impact = 2
set.seed(1)
dat <- simRaschmix(des)
rs <- rowSums(dat)
cl <- factor(as.numeric(attr(dat, "cluster") > 2) + 1)
ip <- attr(dat, "difficulty")[,2:3]

stacked_bars(rs, cl, max = 20, ylab = "Score frequencies", xlab = "")

## heterogeneous abilities between DIF groups
des <- generateDesign(ab = 1.0, dif = 2, coincide = TRUE, ab.dist = "normal") ## ab = 1 --> impact = 2
set.seed(1)
dat <- simRaschmix(des)
rs <- rowSums(dat)
cl <- factor(as.numeric(attr(dat, "cluster") > 2) + 1)
ip <- attr(dat, "difficulty")[,2:3]

stacked_bars(rs, cl, max = 20, ylab = "Score frequencies", xlab = "")

par(mfrow = c(1,1))
@ 
%
\caption{Stacked histograms of score distributions for Scenarios~4 (left) and~5
  (right) with DIF ($\Delta = 2$). Left: impact and DIF, not coinciding 
  ($\Theta = 2$). Right: impact and DIF, coinciding ($\Theta = 2$). For item 
  difficulties see Figure~\ref{fig:simD:DIFhomo} (left).
  \label{fig:simD:DIFhet}}
\end{figure}

Impact and DIF, or lack thereof, can be combined in several ways. 
Table~\ref{tab:simD:design} provides an overview and 
Figures~\ref{fig:simD:DIFhomo}, \ref{fig:simD:noDIFhet}, 
and~\ref{fig:simD:DIFhet} show illustrations. In the following, the different 
combinations of impact and DIF are explained in more detail and connected to the
motivational example:
%
\begin{itemize}
  \item If the simulation parameter~$\Delta$ for the DIF effect size is set to 
    zero, both sets of item parameters, $\beta^\mathit{I}$ and $\beta^\mathit{II}$,
    are identical and no DIF is present. Since CML is employed, model selection
    and parameter estimation is typically expected to be independent of whether
    or not an impact is present (Scenario~1 and~3 in 
    Table~\ref{tab:simD:design}). 
    
    In the example: Only the standard course is taught and hence no DIF exists.
    
  \item If $\Delta > 0$, the item parameter set $\beta^\mathit{II}$ is different
    from $\beta^\mathit{I}$. Hence, there is DIF and two latent classes exist 
    (Scenarios~2, 4, and~5). Both classes are chosen to be of equal size in this
    case. For an illustration see the left panel of
    Figure~\ref{fig:simD:DIFhomo}. 
    
    In the example: Both courses are taught, thus leading to DIF. The standard 
    course corresponds to the straight line as the item profile while the
    specialized course corresponds to the spiked item profile with relatively
    difficult item~16 being easier and the relatively easy item~5 being more
    difficult for students in this specialized course than for students in the
    standard course.

  \item If the simulation parameter $\Theta$ for the impact is set to 
    zero~(Scenarios~1 and~2), then the resulting score distribution is unimodal.
    For an illustration of such a unimodal score distribution see the right 
    panel of Figure~\ref{fig:simD:DIFhomo}. This histogram illustrates 
    specifically Scenario~2 where no impact is present but DIF exists. The
    histogram is shaded in light and dark gray for the two DIF groups and thus 
    to be read like a \dquote{stacked histogram}.
    
    In the example: All students are from the same school and hence there is no 
    impact. However, both types of courses may be taught in this one school, 
    thus leading to DIF as in Scenario~2.
      
  \item If $\Theta > 0$, subject abilities are sampled with equal weights from 
    two normal distributions with means $\{-\Theta/2, +\Theta/2\}$, thus 
    generating impact. When no DIF is included (Scenario~3), the resulting score
    distribution moves from being unimodal to being bimodal with increasing
    $\Theta$. The two modi of high and low scores represent the two groups of 
    subjects with high and low mean abilities, respectively. However, only a 
    medium gray is used to shade the illustrating histogram in 
    Figure~\ref{fig:simD:noDIFhet} as \emph{no DIF groups} are present.
    
    In the example: Only the standard course is taught in both school types.
    Hence no DIF is present but impact between the school types.
    
  \item If there is DIF~(i.e., $\Delta > 0$) in addition to impact~(i.e., 
    $\Theta > 0$), subjects can be grouped both according to mean ability (high 
    vs.\ low) and difficulty (straight vs.\ spiked profile in $\beta^{I}$ and
    $\beta^{II}$, respectively).
    
    These groups can \emph{coincide}: For subjects with low mean 
    ability~$-\Theta/2$, item difficulties~$\beta^{I}$ hold, while for subjects
    with high mean ability~$+\Theta/2$, item difficulties~$\beta^{II}$ hold. This
    is simulated in Scenario~5 and labeled \emph{Impact and DIF, coinciding}. 
    The resulting score distribution is illustrated in the right panel of 
    Figure~\ref{fig:simD:DIFhet}. Subjects for whom item 
    difficulties~$\beta^{I}$ hold are shaded in dark gray and as they also have
    lower mean abilities, their scores are all relatively low. Conversely, 
    subjects for whom item difficulties~$\beta^{II}$ hold are shaded in light
    gray and as they also have higher mean abilities, their scores are all 
    relatively high.
    
    Additionally, the DIF groups and ability groups can also 
    \emph{not coincide}: Subjects in either DIF group may stem from both ability
    groups, not just one. This is simulated in Scenario~4 and labeled
    \emph{Impact and DIF, not coinciding}. The resulting score distribution is
    illustrated in the left panel of Figure~\ref{fig:simD:DIFhet}. Again, 
    subjects for whom item difficulties~$\beta^{I}$ and $\beta^{II}$ hold are 
    shaded in dark and light gray, respectively. As subjects stem from both
    ability groups (high vs.\ low abilities), both score distributions are 
    bimodal.
      
    In the example: Students from both school types and from both course types
    are considered, thus leading to both impact and DIF. Either both courses are
    taught at both schools (Scenario~4, not coinciding) or the standard course
    is only taught in the first school and the specialized course is only taught
    at the second school (Scenario~5, coinciding).
    
\end{itemize}
%
Note that Scenario~1 is a special case of Scenario~2 where $\Delta$ is 
reduced to zero as well as a special case of Scenario~3 where $\Theta$ is 
reduced to zero. Therefore, in the following, Scenario~1 is not inspected 
separately but included in both the setting of
\emph{No impact with DIF} (Scenario~2) and the setting of
\emph{Impact without DIF} (Scenario~3) as a reference point. 
Similarly, Scenarios~4 and~5 both can be reduced to Scenario~3 if $\Delta$ is 
set to zero. It is therefore also included in both the setting of
\emph{Impact and DIF, not coinciding} (Scenario~4) and the setting of
\emph{Impact and DIF, coinciding} (Scenario~5) as a reference point.

For each considered combination of $\Delta$ and $\Theta$, 500 datasets of 500 
observations each are generated. Larger numbers of datasets or observations lead
to very similar results. Observations with raw scores of 0 or $m$ are removed
from the dataset as they do not contribute to the estimation of the Rasch
mixture model \citep{psychomix:Rost:1990}. For each dataset, Rasch mixture 
models for each of the saturated, mean-variance, and restricted score 
specifications are fitted for $K = 1, 2, 3$.

%\pagebreak

\subsection{False alarm rate and hit rate} \label{sec:DIFdetection}

The main objective here is to determine how suitable a Rasch mixture model, 
with various choices for the score model, is to recognize DIF or the lack 
thereof. 

For each dataset and type of score model, models with $K = 1, 2, 3$ latent 
classes are fitted and the $\hat K$ associated with the minimum BIC is selected.
Choosing one latent class ($\hat K = 1$) then corresponds to assuming 
measurement invariance while choosing more than one latent class ($\hat K > 1$)
corresponds to assuming violations of measurement invariance. While Rasch
mixture models do not constitute a formal significance test, the empirical 
proportion among the 500~datasets with $\hat K > 1$ corresponds in essence to
the power of DIF detection if $\Delta > 0$ (and thus two true latent classes 
exist) and to the associated type I error of a corresponding test if 
$\Delta = 0$ (and thus only one true latent class exists). If the rate
corresponds to power, it will be referred to as \emph{hit rate} whereas if it 
corresponds to a type I error it will be referred to as \emph{false alarm rate}.

In the following subsections, the key results of the simulation study 
will be visualized. The exact rates for all conditions are included as a dataset
in the \proglang{R} package \pkg{psychomix}, for details see the section on 
computational details.

\subsubsection{Scenario 2: No impact with DIF}

This scenario is investigated as a case of DIF that should be fairly 
simple to detect. There is no impact as abilities are homogeneous across all 
subjects so the only latent structure to detect is the group membership based on
the two item profiles. This latent structure is made increasingly easy to detect
by increasing the difference between the item difficulties for both latent 
groups. In the graphical representation of the item parameters (left panel of
Figure~\ref{fig:simD:DIFhomo}) this corresponds to enlarging the spikes in the
item profile. 

Figure~\ref{fig:simR:DIFhomo} shows how the rate of choosing a model with more 
than one latent class~($\hat K > 1$) increases along with the DIF effect 
size~$\Delta$. At~$\Delta = 0$, this is a false alarm rate. It is around $7$\% 
for the saturated model and very close to zero for the mean-variance and the
saturated score model ($< 1$\%). With increasing $\Delta > 0$, the rate is a 
hit rate. For low values of~$\Delta$ the two more parsimonious versions of the
Rasch mixture model (with mean-variance and restricted score distribution) are 
not able to pick up the DIF but at around~$\Delta = 3$ the hit rate for the two
models increases and almost approaches~$1$ at~$\Delta = 4$. Not surprisingly, 
the restricted score specification performs somewhat better because in fact the 
raw score distributions do not differ between the two latent classes. 
The baseline hit rate of the saturated model for low values of~$\Delta$
is the same as the false alarm rate for~$\Delta = 0$. It only increases beyond 
the same threshold~($\Delta = 3$) as the hit rate of the other two models.
However, its rate is much lower compared to the other two score model (only 
around 30\%). The reason is that it requires 18~additional score parameters 
for an additional latent class which is \dquote{too costly} in terms of BIC. 
Hence,~$\hat K = 1$ is chosen for most Rasch mixture models using a saturated
score distribution.

The number of iterations in the EM algorithm which are necessary for the
estimation to converge is much lower for the mean-variance and the restricted 
model than for the saturated model. Since the estimation of the saturated model
is more extensive due to the higher number of parameters required by this model,
it does not converge in about 10\% of the cases before reaching the maximum 
number of iterations which was set to 400. The mean-variance and saturated model
usually converge within the first 200 iterations.

\emph{Brief summary:} The mean-variance and restricted model have higher hit 
rates than the saturated model in the absence of impact.


\setkeys{Gin}{width = 0.65\textwidth}
\begin{figure}[t!]
\centering
%  
<<fig-simR-DIFhomo, echo = FALSE, fig = TRUE, width = 6.5, height = 4.5>>=
par(mar = c(4, 4, 2, 2) + 0.1)

plot(prop23 ~ delta, data = scoresim, subset = theta == 0 & scores == "saturated", 
     ylim = c(0,1), type = "b", 
     xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
     ylab = "Choose more than 1 latent class", col = myhcl[3], lty = 3, pch = 3)
lines(prop23 ~ delta, data = scoresim, subset = theta == 0 & scores == "meanvar", 
     type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(prop23 ~ delta, data = scoresim, subset = theta == 0 & scores == "restricted", 
     type = "b", col = myhcl[1], lty = 2, pch = 6)
legend("topleft", legend = c("saturated", "mean-variance", "restricted"), 
       col = myhcl[3:1], lty = c(3,1,2), pch = c(3,1,6), bty = "n")

par(mar = c(5, 4, 4, 2) + 0.1)
@
%
\caption{Rate of choosing a model with $\hat K > 1$ latent classes for data from
Scenario~2 (DIF without impact, i.e., $\Theta = 0$).
\label{fig:simR:DIFhomo}}
\end{figure}


\subsubsection{Scenario 3: Impact without DIF}

Preferably, a Rasch mixture model should not only detect latent classes if the
assumption of measurement invariance is violated but it should also indicate a
lack of latent structure if indeed the assumption holds. In this scenario, the
subjects all stem from the same class, meaning each item is of the same
difficulty for every subject. However, subject abilities are simulated with 
impact resulting in a bimodal score distribution as illustrated in 
Figure~\ref{fig:simD:noDIFhet}.

Here, the rate of choosing more than one latent class can be interpreted as a 
false alarm rate (Figure~\ref{fig:simR:noDIFhet}). The restricted score model
is invariant against any latent structure in the score distribution and thus 
almost always ($\le$ 0.2\%) suggests $\hat K = 1$ latent class based on the
DIF-free item difficulties. The rate does not approach any specific significance
level as the Rasch mixture model, regardless of the employed score distribution,
is not a formal significance test. The saturated model also picks $\hat K = 1$ 
in most of the simulation. This might be due to its general reluctance to choose
more than one latent class as illustrated in Figure~\ref{fig:simR:DIFhomo} or
the circumstance that it can assume any shape (including bimodal patterns).
However, the mean-variance score distribution can only model unimodal or 
U-shaped distributions as mentioned above. Hence, with increasing impact and
thus increasingly well-separated modes in the score distribution, the Rasch 
mixture model with this score specification suggests $\hat K > 1$ latent 
classes in up to 53\% of the cases. Note, however, that these latent classes do 
not represent the DIF groups (as there are none) but rather groups of subjects 
with high vs.\ low abilities. While this may be acceptable (albeit unnecessarily
complex) from a statistical mixture modeling perspective, it is misleading from 
a psychometric point of view if the aim is DIF detection. Only one Rasch model 
needs to be estimated for this type of data, consistent item parameter estimates
can be obtained via CML and all observations can be scaled in the same way.

\emph{Brief summary:} If measurement invariance holds but ability differences 
are present, the mean-variance model exhibits a high false alarm rate while the
saturated and restricted model are not affected.


\begin{figure}[t!]
\centering
%  
<<fig-simR-noDIFhet, echo = FALSE, fig = TRUE, width = 6.5, height = 4.5>>=
par(mar = c(4, 4, 2, 2) + 0.1)

plot(prop23 ~ theta, data = scoresim, subset = delta == 0 & scores == "saturated", 
     ylim = c(0,1), type = "b", 
     xlab = expression(paste("Impact (", Theta, ")", sep = "")),
     ylab = "Choose more than 1 latent class", col = myhcl[3], lty = 3, pch = 3)
lines(prop23 ~ theta, data = scoresim, subset = delta == 0 & scores == "meanvar", 
     type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(prop23 ~ theta, data = scoresim, subset = delta == 0 & scores == "restricted", 
     type = "b", col = myhcl[1], lty = 2, pch = 6)
legend("topleft", legend = c("saturated", "mean-variance", "restricted"), 
       col = myhcl[3:1], lty = c(3,1,2), pch = c(3,1,6), bty = "n")

par(mar = c(5, 4, 4, 2) + 0.1)
@
%
\caption{Rate of choosing a model with $\hat K > 1$ latent classes for data from
Scenario~3 (impact without DIF, i.e., $\Delta = 0$).
\label{fig:simR:noDIFhet}}
\end{figure}


\subsubsection{Scenario 4: Impact and DIF, not coinciding}

In this scenario, there is DIF (and thus two true latent classes) if 
$\Delta > 0$. Again, Scenario~3 with $\Delta = 0$ (and thus without DIF) is 
included as a reference point. However, unlike in Scenario~2, the abilities 
within the latent classes are not homogeneous but two ability groups exist, 
which do not coincide with the two DIF groups. Nonetheless, the score
distribution is the same across both latent classes (illustrated in the left
panel of Figure~\ref{fig:simD:DIFhet}).

Figure~\ref{fig:simR:DIFhetWithin} again shows the rate of choosing $\hat K > 1$
for increasing DIF effect size $\Delta$ for two levels of impact~($\Theta = 2.4$
and $3.6$), exemplary for medium and high impact. If impact is small 
(e.g.,~$\Theta = 0.4$), the rates are very similar to the case of completely
homogeneous abilities without impact~(Figure~\ref{fig:simR:DIFhomo} 
with~$\Theta = 0$) and thus not visualized here. While the rates for the 
restricted and the saturated score model do not change substantially for an 
increased impact~($\Theta = 2.4$ and~$3.6$), the mean-variance model is
influenced by this change in ability differences. While the hit rate is
increased to around 20\% over the whole range of~$\Delta$, the false alarm rate
at~$\Delta = 0$ is increased to the same extent. Moreover, the hit rate only
increases noticeably beyond the initial false alarm rate at around~$\Delta = 3$,
i.e., the same DIF effect size at which the restricted and mean-variance
specifications have an increasing hit rate given homogeneous abilities without
impact. Thus, given rather high impact~($\Theta = 3.6$) the hit rate is not
driven by the DIF detection but rather the model's tendency to assign subjects 
with high vs.\ low abilities into different groups (as already seen in 
Figure~\ref{fig:simR:noDIFhet}).

\setkeys{Gin}{width = \textwidth}
\begin{figure}[t!]
\centering
%
<<fig-simR-DIFhetWithin, echo = FALSE, fig = TRUE, width = 6.4, height = 3>>=
par(mfrow = c(1, 2))
layout(matrix(c(rep(1,4), rep(2,4)), nrow = 1, byrow = TRUE))

## impact = 2.4
par(mar = c(5, 4, 4, 0.5) + 0.1)
plot(prop23 ~ delta, data = scoresim, 
  subset = theta == 2.4 & scores == "saturated" & (scenario == 4 | delta == 0), 
  main = expression(paste("Impact ", Theta, " = 2.4", sep = "")),
  ylim = c(0,1), type = "b", 
  xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "Choose more than 1 latent class", col = myhcl[3], lty = 3, pch = 3)
lines(prop23 ~ delta, 
  data = scoresim, subset = theta == 2.4 & scores == "meanvar" & (scenario == 4 | delta == 0), 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(prop23 ~ delta, data = scoresim, 
  subset = theta == 2.4 & scores == "restricted" & (scenario == 4 | delta == 0),
  type = "b", col = myhcl[1], lty = 2, pch = 6)
legend("topleft", legend = c("saturated", "mean-variance", "restricted"), 
  col = myhcl[3:1], lty = c(3,1,2), pch = c(3,1,6), bty = "n")

## impact = 3.6
par(mar = c(5, 0.5, 4, 4) + 0.1) # c(bottom, left, top, right)
plot(prop23 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "saturated" & (scenario == 4 | delta == 0), 
  main = expression(paste("Impact ", Theta, " = 3.6", sep = "")),
  ylim = c(0,1), type = "b", axes = FALSE,
  xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "", col = myhcl[3], lty = 3, pch = 3)
box(); axis(1)
lines(prop23 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "meanvar" & (scenario == 4 | delta == 0), 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(prop23 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "restricted" & (scenario == 4 | delta == 0),
  type = "b", col = myhcl[1], lty = 2, pch = 6)

par(mar = c(5, 4, 4, 2) + 0.1, mfrow = c(1,1)) ## restore default
@
%
\caption{Rate of choosing a model with $\hat K > 1$ latent classes for data from
  Scenario~4 (impact and DIF, not coinciding).
  \label{fig:simR:DIFhetWithin}}
\end{figure}

As Rasch mixture models with $K = 1, 2, 3$ classes are considered, selecting
$\hat K > 1$ classes can either mean selecting the correct number of $K = 2$
or overselecting $\hat K = 3$ classes. For the saturated and restricted 
specifications overselection is rare (occurring with rates of less than 9\% or 
less than 1\%, respectively). However, similar to Scenario~3 overselection is 
not rare for the mean-variance specification. Figure~\ref{fig:simR:prop23} 
depicts the rates of selecting $\hat K = 2$ and $\hat K = 3$ classes, 
respectively, for increasing~$\Delta$ at $\Theta = 3.6$. If the chances of 
finding the correct number of classes increase with the DIF effect 
size~$\Delta$, the rate for overselection ($\hat K = 3$) should drop with 
increasing~$\Delta$. For this Scenario~4, denoted with hollow symbols, this rate
stays largely the same (around 25\%) and even slightly increases beyond this
level, starting from around $\Delta = 3$. This illustrates again the pronounced 
tendency of the mean-variance model for overselection in cases of high impact.

\emph{Brief summary:} If impact is simulated within DIF groups, the 
mean-variance model has higher hit rates than the saturated and restricted 
models. However, the latent classes estimated by the mean-variance model are 
mostly based on ability differences if the DIF effect size is low. If the DIF 
effect size is high, the mean-variance model tends to overestimate the number 
of classes.


\setkeys{Gin}{width = 0.65\textwidth}
\begin{figure}[t!]
\centering
%  
<<fig-simR-prop23, echo = FALSE, fig = TRUE, width = 6.5, height = 4.5>>=
par(mar = c(4, 4, 4, 2) + 0.1)

plot(prop3 ~ delta, data = scoresim,
  subset = theta == 3.6 & scores == "meanvar" & (scenario == 4 | delta == 0),
  type = "b", ylim = c(0, 1), col = myhcl[2], pch = 1,
  main = expression(paste("Impact ", Theta, " = 3.6", sep = "")),
  xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "Selection proportions")
lines(prop3 ~ delta, data = scoresim,
  subset = theta == 3.6 & scores == "meanvar" & (scenario == 5 | delta == 0),
  type = "b", col = myhcl[2], pch = 16)

lines(prop2 ~ delta, data = scoresim,
  subset = theta == 3.6 & scores == "meanvar" & (scenario == 4 | delta == 0),
  type = "b", col = myhcl[2], pch = 2)
lines(prop2 ~ delta, data = scoresim,
  subset = theta == 3.6 & scores == "meanvar" & (scenario == 5 | delta == 0),
  type = "b", col = myhcl[2], pch = 17)

legend("topleft", 
       legend = c(expression(paste(hat(K), " = 2 - Sc 4", sep = "")),
           expression(paste(hat(K), " = 3 - Sc 4", sep = "")),
           expression(paste(hat(K), " = 2 - Sc 5", sep = "")),
           expression(paste(hat(K), " = 3 - Sc 5", sep = ""))),
           #"2 - Sc 4", "3 - Sc 4", "2 - Sc 5", "3 - Sc 5"), 
  col = myhcl[2], lty = 1, pch = c(2, 1, 17, 16), bty = "n")

par(mar = c(5, 4, 4, 2) + 0.1)
@
%
\caption{Rates of choosing the correct number of classes ($\hat K = 2$) or 
  overselecting the number of classes ($\hat K = 3$) for the Rasch mixture model
  with mean-variance score specification in Scenarios~4 (hollow, impact within 
  DIF groups) and~5 (solid, impact between DIF groups).}
\label{fig:simR:prop23}
\end{figure}
\setkeys{Gin}{width = \textwidth}

%\clearpage

\subsubsection{Scenario 5: Impact and DIF, coinciding}

In Scenario~5, there is also DIF (i.e., $\Delta > 0$) and impact. However, in 
contrast to Scenario~4 the ability and DIF groups coincide (see the right panel
of Figure~\ref{fig:simD:DIFhet}). Furthermore, Scenario~3 is included also here
as the reference point without DIF ($\Delta = 0$).

Again, small ability differences do not strongly influence the rate of choosing 
more than one latent class (rates for low levels of impact, such 
as~$\Theta = 0.4$, are similar to those for~$\Theta = 0$ as depicted in 
Figure~\ref{fig:simR:DIFhomo}). Recall, both mean-variance and restricted 
specification have comparable hit rates for DIF detection starting from around 
$\Delta = 3$ while the saturated specification has lower hit rates. 

As impact increases (Figure~\ref{fig:simR:DIFhetBetween}), the hit rates of all
models increases as well because the ability differences contain information 
about the DIF groups: separating subjects with low and high abilities also 
separates the two DIF groups (not separating subjects within each DIF group as 
in the previous setting). However, for the mean-variance model these increased 
hit rates are again coupled with a highly increased false alarm rate at 
$\Delta = 0$ of 26\% and 50\% for $\Theta = 2.4$ and $3.6$, respectively. The 
restricted score model, on the other hand, is invariant to latent structure in 
the score distribution and thus performs similarly as in previous DIF scenarios,
suggesting more than one latent class past a certain threshold of DIF intensity,
albeit this threshold being a bit lower than when ability groups and DIF groups
do not coincide (around $\Delta = 2$). The saturated model detects more than one
latent class at a similar rate to the restricted score model for medium or high
impact but its estimation converges more slowly and requires more iterations of
the EM algorithm than the other two score models.

\begin{figure}[t!]
\centering
% 
<<fig-simR-DIFhetBetween, echo = FALSE, fig = TRUE, width = 6.4, height = 3>>=
par(mfrow = c(1, 2))
layout(matrix(c(rep(1,4), rep(2,4)), nrow = 1, byrow = TRUE))

## impact = 2.4
par(mar = c(5, 4, 4, 0.5) + 0.1)
plot(prop23 ~ delta, data = scoresim, 
  subset = theta == "2.4" & scores == "saturated" & (scenario == 5 | delta == 0), 
  main = expression(paste("Impact ", Theta, " = 2.4", sep = "")),
  ylim = c(0,1), type = "b", 
  xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "Choose more than 1 latent class", col = myhcl[3], lty = 3, pch = 3)
lines(prop23 ~ delta, 
  data = scoresim, subset = theta == "2.4" & scores == "meanvar" & (scenario == 5 | delta == 0), 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(prop23 ~ delta, data = scoresim, 
  subset = theta == "2.4" & scores == "restricted" & (scenario == 5 | delta == 0),
  type = "b", col = myhcl[1], lty = 2, pch = 6)
## legend("topleft", legend = c("saturated", "mean-variance", "restricted"), 
##   col = myhcl[3:1], lty = c(3,1,2), pch = c(3,1,6), bty = "n")

## impact = 3.6
par(mar = c(5, 0.5, 4, 4) + 0.1) # c(bottom, left, top, right)
plot(prop23 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "saturated" & (scenario == 5 | delta == 0), 
  main = expression(paste("Impact ", Theta, " = 3.6", sep = "")),
  ylim = c(0,1), type = "b", axes = FALSE,
  xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "", col = myhcl[3], lty = 3, pch = 3)
box(); axis(1)
lines(prop23 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "meanvar" & (scenario == 5 | delta == 0), 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(prop23 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "restricted" & (scenario == 5 | delta == 0),
  type = "b", col = myhcl[1], lty = 2, pch = 6)
legend("bottomright", legend = c("saturated", "mean-variance", "restricted"), 
  col = myhcl[3:1], lty = c(3,1,2), pch = c(3,1,6), bty = "n")

par(mar = c(5, 4, 4, 2) + 0.1, mfrow = c(1,1)) ## restore default
@
%
\caption{Rate of choosing a model with $\hat K > 1$ latent classes for data from
  Scenario~5 (impact and DIF, coinciding).
  \label{fig:simR:DIFhetBetween}}
\end{figure}

Finally, the potential issue of overselection can be considered again.
Figure~\ref{fig:simR:prop23} (solid symbols) shows that this problem disappears
for the mean-variance specification if both DIF effect size $\Delta$ and impact
are large \emph{and} coincide. For the restricted model overselection is again 
very rare throughout (occurring in less than 1\% of all cases) while the 
saturated model overselects in up to 29\% of the datasets.

\emph{Brief summary:} If abilities differ between DIF groups, the mean-variance 
model detects the violation of measurement invariance for smaller DIF effect 
sizes than the saturated and restricted model. While the mean-variance model 
does not overselect the number of components in this scenario, the high hit
rates are connected to a high false alarm rate when no DIF is present but impact
is high. This does not affect the other two score models.


\subsection{Quality of estimation} \label{sec:quality}

Although here the Rasch mixture model is primarily used analogously to a global
DIF test, model assessment goes beyond the question whether or not the correct
number of latent classes is found. Once the number of latent classes is 
established/estimated, it is of interest how well the estimated model fits the
data. Which groups are found? How well are the parameters estimated? In the
context of Rasch mixture models with different score distributions, both of
these aspects depend heavily on the posterior probabilities $\hat p_{ik}$ 
(Equation~\ref{eq:RMM:Estep:hg}) as the estimation of the item parameters 
depends on the score distribution only through these. If the $\hat p_{ik}$ were
the same for all three score specifications, the estimated item difficulties 
were the same as well. Hence, the focus here is on how close the estimated
posterior probabilities are to the true latent classes in the data. If the
similarity between these is high, CML estimation of the item parameters within
the classes will also yield better results for all score models.

This is a standard task in the field of cluster analysis and we adopt the widely
used Rand index \citep{psychomix:Rand:1971} here: Each observation is assigned
to the latent class for which its posterior probability is highest yielding an
estimated classification of the data which is compared to the true
classification. For this comparison, pairs of observations are considered. Each
pair can either be in the same class in both the true and the estimated 
classification, in different classes for both classifications or it can be in
the same class for one but not the other classification. The Rand index is the
proportion of pairs for which both classifications agree. Thus, it can assume
values between 0 and 1, indicating total dissimilarity and similarity, 
respectively.


\setkeys{Gin}{width = \textwidth}
\begin{figure}[t!]
\centering
% 
<<fig-simR-Rand, echo = FALSE, fig = TRUE, width = 6, height = 4.7>>=
par(mar = c(5, 4, 4, 2) + 0.1) # c(bottom, left, top, right)
#par(mfrow = c(2, 2))
layout(matrix(rep(1:4, each = 4), nrow = 2, byrow = TRUE))

## scenario 4
## impact = 0.4
par(mar = c(1.5, 4, 4, 0.5) + 0.1)
plot(rand2 ~ delta, data = scoresim, 
  subset = theta == 0.4 & scores == "saturated" & scenario == 4 & delta > 0, 
  main = expression(paste("Impact ", Theta, " = 0.4", sep = "")),
  ylim = c(0.5,1), type = "b", 
  xlab = "", #expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "Rand index", col = myhcl[3], lty = 3, pch = 3)
lines(rand2 ~ delta, 
  data = scoresim, subset = theta == 0.4 & scores == "meanvar" & scenario == 4 & delta > 0, 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(rand2 ~ delta, data = scoresim, 
  subset = theta == 0.4 & scores == "restricted" & scenario == 4 & delta > 0,
  type = "b", col = myhcl[1], lty = 2, pch = 6)
legend("topleft", legend = c("saturated", "mean-variance", "restricted"), 
  col = myhcl[3:1], lty = c(3,1,2), pch = c(3,1,6), bty = "n")
                                        
## impact = 3.6
par(mar = c(1.5, 0.5, 4, 4) + 0.1)
plot(rand2 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "saturated" & scenario == 4 & delta > 0, 
  main = expression(paste("Impact ", Theta, " = 3.6", sep = "")),
  ylim = c(0.5,1), type = "b", axes = FALSE,
  xlab = "", #expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "", col = myhcl[3], lty = 3, pch = 3)
box(); axis(1)
lines(rand2 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "meanvar" & scenario == 4 & delta > 0, 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(rand2 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "restricted" & scenario == 4 & delta > 0,
  type = "b", col = myhcl[1], lty = 2, pch = 6)
text(4.4, 0.65, "Scenario 4", pos = 4, srt = 90, xpd = TRUE)

## scenario 5
## impact = 0.4
par(mar = c(5, 4, 1, 0.5) + 0.1)
plot(rand2 ~ delta, data = scoresim, 
  subset = theta == 0.4 & scores == "saturated" & scenario == 5 & delta > 0, 
  #main = expression(paste("Impact ", Theta, " = 0.4", sep = "")),
  ylim = c(0.5,1), type = "b", 
  xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "Rand index", col = myhcl[3], lty = 3, pch = 3)
lines(rand2 ~ delta, 
  data = scoresim, subset = theta == 0.4 & scores == "meanvar" & scenario == 5 & delta > 0, 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(rand2 ~ delta, data = scoresim, 
  subset = theta == 0.4 & scores == "restricted" & scenario == 5 & delta > 0,
  type = "b", col = myhcl[1], lty = 2, pch = 6)
#legend("topleft", legend = c("saturated", "mean-variance", "restricted"), 
#  col = myhcl[3:1], lty = c(3,1,2), pch = c(3,1,6), bty = "n")
 
par(mar = c(5, 0.5, 1, 4) + 0.1) # c(bottom, left, top, right)
plot(rand2 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "saturated" & scenario == 5 & delta > 0, 
  #main = expression(paste("Impact ", Theta, " = 3.6", sep = "")),
  ylim = c(0.5,1), type = "b", axes = FALSE,
  xlab = expression(paste("DIF effect size (", Delta, ")", sep = "")),  
  ylab = "", col = myhcl[3], lty = 3, pch = 3)
box(); axis(1)
lines(rand2 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "meanvar" & scenario == 5 & delta > 0, 
  type = "b", col = myhcl[2], lty = 1, pch = 1)
lines(rand2 ~ delta, data = scoresim, 
  subset = theta == 3.6 & scores == "restricted" & scenario == 5 & delta > 0,
  type = "b", col = myhcl[1], lty = 2, pch = 6)
text(4.4, 0.65, "Scenario 5", pos = 4, srt = 90, xpd = TRUE)

par(mar = c(5, 4, 4, 2) + 0.1, mfrow = c(1,1)) ## restore default
@
%
\caption{Average Rand index for models with $K = 2$ latent classes. 
   Top row: Scenario~4 (impact and DIF, not coinciding). Bottom row: Scenario~5 
   (impact and DIF, coinciding).
  \label{fig:simR:Rand}}
\end{figure}

In the following, the Rand index for models with the true number of $K = 2$ 
latent classes in Scenarios~4 and~5 (with DIF) is considered. Thus, the
question of DIF detection (or model selection) is not investigated again but
only the quality of latent class recovery (assuming the number of classes $K$ 
to be known or correctly selected). The top row of Figure~\ref{fig:simR:Rand} 
depicts the average Rand index for data from Scenario~4 (impact and DIF, not 
coinciding). Here, all three score specifications find similarly well matching 
classifications, while the Rand index generally decreases with increasing impact
(left to right panel). In particular, while the mean-variance score model
has problems finding the \emph{correct number} of latent classes in this 
scenario, it only performs slightly worse than the other two specifications in 
determining the \emph{correct classes} if the number were known. Similarly, if
it is provided with the correct number of classes, the saturated model also 
identifies the correct classes equally well compared to the other models -- 
despite its difficulties with convergence for higher DIF effect sizes.

However, in Scenario~5 where the score distribution contains information about
the DIF groups, the three score specifications perform very differently as
the bottom row of Figure~\ref{fig:simR:Rand} shows. Given the correct number of 
classes, the mean-variance model is most suitable to uncover the true latent 
classes, yielding Rand indices close to 1 if both DIF effect size and impact are
large. The saturated specification follows a similar pattern albeit with poorer 
results, reaching values of up to 0.87. However, the classifications obtained 
from the restricted score specification do not match the true groups well in 
this scenario, remaining below 0.52 if impact is high. The reason is that the 
restricted score model is partially misspecified as the score distributions 
differ substantially across DIF groups.


\subsection{Summary and implications for practical use}\label{sec:discussion}

Given various combinations of DIF and ability impact, the score models are 
differently suitable for the two tasks discussed here -- DIF detection and 
estimation of item parameters in subgroups. Starting with a summary of the 
results for DIF detection: 

%
\begin{itemize}
  \item The saturated score model has much lower hit rates than the other two 
    specifications, i.e., violation of measurement invariance remains too often
    undetected. Only if high impact and high DIF effect sizes coincide does the
    saturated model perform similarly well as the restricted model.
  \item The mean-variance model has much higher hit rates. However, if impact is
    present in the abilities, this specification has highly inflated false alarm
    rates. Hence, if the mean-variance model selects more than one latent class
    it is unclear whether this is due to DIF or just varying subject abilities.
    Thus, measurement invariance might still hold even if more than one latent
    class is detected.
  \item The restricted score model also has high hit rates, comparable to the 
    mean-variance model if abilities are rather homogeneous. But unlike the 
    mean-variance specification, its false alarm rate is not distorted by 
    impact. Its performance is not influenced by the ability distribution and 
    detecting more than one latent class reliably indicates DIF, i.e., a 
    violation of measurement invariance.
\end{itemize}
%
Hence, if the Rasch mixture model is employed for assessing measurement 
invariance or detecting DIF, then the restricted score specification appears to 
be most robust. Thus, the selection of the number of latent classes should only
be based on this specification.

\cite{psychomix:DeMars:2010} illustrate how significance tests based on the 
observed (raw) scores in reference and focal groups suffer from inflated type~I
error rates with an increased sample size if impact is present. This does not 
apply to the false alarm rate of Rasch mixture models because not a significance
test but rather model selection via BIC is carried out. The rate of the BIC 
selecting the correct model increases with larger sample size if the true model
is a Rasch mixture model. Since consistent estimates are employed, a larger 
sample size also speeds up convergence, which is particularly desirable for the 
saturated model if the number of latent classes and thus the number of 
parameters is high.
  
Given the correct number of classes, the different score models are all 
similarly suitable to detect the true classification if ability impact does not 
contain any additional information about the DIF groups. However, if ability 
impact is highly correlated with DIF groups in the data and the ability groups 
thus coincide with the DIF groups, this information can be exploited by the 
unrestricted specifications while it distracts the restricted model.

Thus, while the selection of the number of latent classes should be based only
on the restricted score specification, the unrestricted mean-variance and
saturated specifications might still prove useful for estimating the Rasch
mixture model (after $\hat K$ has been selected). 

We therefore recommend a two-step approach for DIF detection via a Rasch 
mixture model. First, the number of latent classes is determined via the 
restricted score model. Second, if furthermore the estimation of the item 
difficulties is of interest, the full selection of score models can the be 
utilized. While the likelihood ratio test is not suitable to test for the number
of latent classes, it can be used to establish the best fitting score model, 
given the number of latent classes. If this approach is applied to the full 
range of score models (saturated and mean-variance, both unrestricted and 
restricted), the nesting structure of the models needs to be kept in mind. 

%\pagebreak

\section{Empirical application: Verbal aggression}\label{sec:application}

We use a dataset on verbal aggression \citep{psychomix:Boeck+Wilson:2004} to
illustrate this two-step approach of first assessing measurement invariance via
a Rasch mixture model with a restricted score distribution and then employing 
all possible score models to find the best fitting estimation of the item 
difficulties.

Participants in this study are presented with one of two potentially frustrating
situations (S1 and S2):
%
\begin{itemize}
  \item S1: A bus fails to stop for me.
  \item S2: I miss a train because a clerk gave me faulty information.
\end{itemize}
%
and a verbally aggressive response (cursing, scolding, shouting). Combining each
situation and response with either \dquote{I want to} or \dquote{I do} leads to
the following 12 items:
%
<<VerbalData, echo=FALSE>>=
data("VerbalAggression", package = "psychotools")
VerbalAggression$resp2 <- VerbalAggression$resp2[, 1:12]
va12 <- subset(VerbalAggression, rowSums(resp2) > 0 & rowSums(resp2) < 12)
items <- colnames(va12$resp2)
@ 
%
%
\begin{table*}[h]
  \centering
  \begin{tabular}{llllll}
    \Sexpr{items[1]} & \Sexpr{items[2]} & \Sexpr{items[3]} & \Sexpr{items[4]} & \Sexpr{items[5]} & \Sexpr{items[6]} \\
    \Sexpr{items[7]} & \Sexpr{items[8]} & \Sexpr{items[9]} & \Sexpr{items[10]} & \Sexpr{items[11]} & \Sexpr{items[12]}
  \end{tabular}
\end{table*}
%

First, we assess measurement invariance with regard to the whole instrument: 
we fit a Rasch mixture model with a restricted score distribution 
for~$K = 1,2,3,4$ and employ the BIC for model selection. Note that the 
restricted versions of the mean-variance and saturated model only differ in 
their log-likelihood by a constant factor and therefore lead to the same model
selection. Results are presented in Table~\ref{tab:selectK}.
%
<<VerbalResFit0, echo = FALSE, eval = FALSE>>=
set.seed(403)
mvR <- raschmix(resp2 ~ 1, data = va12, k = 1:4, scores = "meanvar",
  restricted = TRUE)
@
<<VerbalResFit, echo = FALSE, results = hide>>=
if(cache & file.exists("va12_mvR.rda")){
    load("va12_mvR.rda")
} else {
<<VerbalResFit0>>
    if(cache){
        save(mvR, file = "va12_mvR.rda")
    } else {
        if(file.exists("va12_mvR.rda")) file.remove("va12_mvR.rda")
    }
}
@ 
<<VerbalLC, echo = FALSE>>=
mvR3 <- getModel(mvR, which = "BIC")
clsizes <- table(clusters(mvR3))
@ 
%
%
<<VerbalTableK, echo = FALSE>>=
tabK <- data.frame(model = rep("restricted", 4),
  k = sapply(mvR@models, function(x) x@k),
  df = sapply(mvR@models, function(x) x@df),
  logLik = sapply(mvR@models, function(x) x@logLik),
  bic = sapply(mvR@models, BIC))
@ 

The BIC for a Rasch mixture model with more than one latent class is smaller 
than the BIC for a single Rasch model, thus indicating that measurement 
invariance is violated. The best fitting model has $\hat K = 3$ latent classes.
Given this selection of $K$, we want to gain further insight in the data and 
thus want to establish the best fitting Rasch mixture model with $K = 3$ latent 
classes. Four different models are conceivable: either using a restricted or 
unrestricted score model, and either using a saturated or mean-variance 
specification. The results for all four options are presented in 
Table~\ref{tab:selectScores}. Note that the models with restricted saturated 
score distribution and restricted mean-variance score distribution lead to 
identical item parameter estimates. However, it is still of interest to fit them
separately because each of the restricted specifications is nested within the 
corresponding unrestricted specification. Furthermore, the mean-variance 
distribution is nested within the saturated distribution.
%
<<VerbalScoresFit0, echo = FALSE, eval = FALSE>>=
## fit all possible score models
sat3 <- raschmix(resp2 ~ 1, data = va12, k = 3, scores = "saturated")
satR3 <- raschmix(resp2 ~ 1, data = va12, k = 3, scores = "saturated", 
                  restricted = TRUE)
mv3 <- raschmix(resp2 ~ 1, data = va12, k = 3, scores = "meanvar")
@ 
<<VerbalScoresFit, echo = FALSE, results = hide>>=
if(cache & file.exists("va12_m3.rda")){
    load("va12_m3.rda")
} else {
<<VerbalScoresFit0>>
    if(cache){
        save(sat3, satR3, mv3, file = "va12_m3.rda")
    } else {
        if(file.exists("va12_m3.rda")) file.remove("va12_m3.rda")
    }
}
@ 
%
<<VerbalTableS, echo = FALSE>>=
library("lmtest") ## for p-values in text
tabS <- data.frame(model = c("saturated", "restricted (saturated)", 
    "mean-variance", "restricted (mean-variance)"),
  k = sapply(list(sat3, satR3, mv3, mvR3), function(x) x@k),
  df = sapply(list(sat3, satR3, mv3, mvR3), function(x) x@df),
  logLik = sapply(list(sat3, satR3, mv3, mvR3), function(x) x@logLik),
  bic = sapply(list(sat3, satR3, mv3, mvR3), BIC))
@ 

\begin{table}[t!]
  \centering
  \begin{tabular}{lrrrr}
    \hline
    Model & k & \#Df & $\log L$ & BIC \\
    \hline
    restricted (mean-variance) & \Sexpr{tabK$k[1]} & \Sexpr{tabK$df[1]} & $\Sexpr{round(tabK$logLik[1], 1)}$ & \Sexpr{round(tabK$bic[1], 1)} \\
    restricted (mean-variance) & \Sexpr{tabK$k[2]} & \Sexpr{tabK$df[2]} & $\Sexpr{round(tabK$logLik[2], 1)}$ & \Sexpr{round(tabK$bic[2], 1)} \\
    \textbf{restricted (mean-variance)} & \textbf{\Sexpr{tabK$k[3]}} & \textbf{\Sexpr{tabK$df[3]}} & $\mathbf{\Sexpr{round(tabK$logLik[3], 1)}}$ & \textbf{\Sexpr{round(tabK$bic[3], 1)}} \\
    restricted (mean-variance) & \Sexpr{tabK$k[4]} & \Sexpr{tabK$df[4]} & $\Sexpr{format(round(tabK$logLik[4], 1), nsmall = 1)}$ & \Sexpr{round(tabK$bic[4], 1)} \\
    \hline
  \end{tabular}
  \caption{DIF detection by selecting the number of latent classes $\hat K$ using the restricted Rasch mixture model.}
  \label{tab:selectK}
\end{table}

\begin{table}[t!]
  \centering
  \begin{tabular}{lrrrr}
    \hline
    Model & k & \#Df & $\log L$ & BIC \\
    \hline
    saturated & \Sexpr{tabS$k[1]} & \Sexpr{tabS$df[1]} & $\Sexpr{round(tabS$logLik[1], 1)}$ & \Sexpr{round(tabS$bic[1], 1)} \\
    restricted (saturated) & \Sexpr{tabS$k[2]} & \Sexpr{tabS$df[2]} & $\Sexpr{round(tabS$logLik[2], 1)}$ & \Sexpr{round(tabS$bic[2], 1)} \\
    mean-variance & \Sexpr{tabS$k[3]} & \Sexpr{tabS$df[3]} & $\Sexpr{round(tabS$logLik[3], 1)}$ & \Sexpr{round(tabS$bic[3], 1)} \\
    \textbf{restricted (mean-variance)} & \textbf{\Sexpr{tabS$k[4]}} & \textbf{\Sexpr{tabS$df[4]}} & $\mathbf{\Sexpr{round(tabS$logLik[4], 1)}}$ & \textbf{\Sexpr{round(tabS$bic[4], 1)}} \\
    \hline
  \end{tabular}
  \caption{Selection of the score distribution given the number of latent classes $\hat K = 3$.}
  \label{tab:selectScores}
\end{table}

As $K = 3$ is identical for all of these four models, standard likelihood ratio
tests can be used for comparing all nested models with each other. 
%The results for the verbal aggression data are shown in Figure~\ref{fig:verbalTest}. 
Testing the most parsimonious score model, the restricted mean-variance model, 
against its unrestricted version and the restricted saturated model at a 5\% 
level shows that a more flexible score model does not yield a significantly 
better fit. The p-value are \Sexpr{round(lrtest(mvR3, mv3)[2,5], 3)} and 
\Sexpr{round(lrtest(mvR3, satR3)[2,5], 3)}, respectively. Hence, the restricted 
mean-variance distribution is adopted here which also has the lowest BIC. 

To visualize how the three classes found in the data differ, the corresponding 
item profiles are shown in Figure~\ref{fig:verbalItems}.

\begin{itemize}
  \item The latent class in the right panel (with 
    \Sexpr{clsizes[3]}~observations) shows a very regular zig-zag-pattern 
    where for any type of verbally aggressive response actually \dquote{doing} 
    the response is considered more extreme than just \dquote{wanting} to 
    respond a certain way as represented by the higher item parameters for the 
    second item, the \dquote{do-item}, than the first item, the 
    \dquote{want-item}, of each pair. The three types of response (cursing,
    scolding, shouting) are considered increasingly aggressive, regardless of
    the situation (first six items vs.\ last six items). 
  \item The latent class in the left panel (with 
    \Sexpr{clsizes[1]}~observations) distinguishes more strongly between the 
    types of response. However, the relationship between wanting and doing is 
    reversed for all responses except shouting. It is more difficult to agree to
    the item \dquote{I want to curse/scold} than to the corresponding item 
    \dquote{I do curse/scold}. This could be interpreted as generally more 
    aggressive behavior where one is quick to react a certain way rather than 
    just wanting to react that way. However, shouting is considered a very 
    aggressive response, both in wanting and doing.
  \item The remaining latent class (with \Sexpr{clsizes[2]}~observations 
    considerably smaller), depicted in the middle panel, does not distinguish 
    that clearly between response types, situations or wanting vs.\ doing.
\end{itemize}

\setkeys{Gin}{width=\textwidth}
\begin{figure}[t!]
\centering
%
<<VerbalItemPlot, echo = FALSE, fig = TRUE, width = 10, height = 4.3>>=
trellis.par.set(theme = standard.theme(color = FALSE))
xyplot(mvR3)
@ 
%
\caption{Item profiles for the Rasch mixture model with $\hat K = 3$ latent 
  classes using a restricted mean-variance score distribution for the verbal 
  aggression data.
  \label{fig:verbalItems}}
\end{figure}


Therefore, not just a single item or a small number of items have DIF but the
underlying want/do relationship of the items is different across the three
classes. This instrument thus works differently as a whole across classes.

In summary, the respondents in this study are not scalable to one single 
Rasch-scale but instead need several scales to represent them accurately. A 
Rasch mixture model with a restricted score distribution is used to estimate the
number of latent classes. Given that number of classes, any type of score model 
is conceivable. Here, the various versions are all fairly similar and the
restricted mean-variance specification is chosen based on likelihood ratio 
tests. Keep in mind that the resulting fits can be substantially different from
each other as shown in the simulation study, in particular for the case of 
impact between DIF classes. The latent classes estimated here differ mainly in 
their perception of the type and the \dquote{want/do}-relationship of a verbally
aggressive response.

%\pagebreak

\section{Conclusion}\label{sec:conclusion}

Unlike in a single Rasch model, item parameter estimation is not independent of 
the score distribution in Rasch mixture models. The saturated and mean-variance
specification of the score model are both well-established. A further option is
the new restricted score specification introduced here. In the context of DIF
detection, only the restricted score specification should be used as it prevents
confounding effects of impact on DIF detection while exhibiting hit rates
positively related to DIF effect size. Given the number of latent classes, it
may be useful to fit the other score models as well, as they might improve
estimation of group membership and therefore estimation of the item parameters.
The best fitting model can be selected via the likelihood ratio test or an 
information criterion such as the BIC. This approach enhances the suitability 
of the Rasch mixture model as a tool for DIF detection as additional information
contained in the score distribution is only employed if it contributes to the 
estimation of latent classes based on measurement invariance.

%% move this section to comments for a pkg release?
\section*{Computational details}

<<sessionInfo, echo = FALSE>>=
session <- sessionInfo()
Rversion <- paste(session$R.version$major, session$R.version$minor, sep = ".")
psyversion <- session$otherPkgs$psychomix$Version
@ 

An implementation of all versions of the Rasch mixture model mentioned here is 
freely available under the General Public License in the \proglang{R} package 
\pkg{psychomix} from the Comprehensive \proglang{R} Archive Network.
Accompanying the package at \url{http://CRAN.R-project.org/package=psychomix} 
is a dataset containing all the simulation results which were generated using 
\proglang{R} version~3.0.2, \pkg{psychomix} version~1.1-0, and \pkg{clv} 
version~0.3-2. This vignette was generated with \proglang{R} 
version~\Sexpr{Rversion}, and \pkg{psychomix} version~\Sexpr{psyversion}.



\section*{Acknowledgments}

This work was supported by the Austrian Ministry of Science BMWF as part of the 
UniInfrastrukturprogramm of the Focal Point Scientific Computing at
Universit{\"a}t Innsbruck.

\nocite{psychomix:Fischer+Molenaar:1995}
\nocite{psychomix:Wainer+Braun:1988}
\bibliography{psychomix}

\end{document}


%% \begin{appendix}
  
%% \section{Web appendix}
%% \label{sec:appendix}

%% This web appendix provides the tables underlying the figures from the main
%% manuscript. \emph{It is intended for the online supplements only (i.e., not for printing).}

%% \begin{table}[h!]
%%   \centering
  
%% <<A:DIF, echo = FALSE, results = tex, eval = FALSE>>=
%% # library("xtable")
%% sc2all <- subset(scoresim, theta == 0, select = c("delta", "theta", "scores", "prop23"))

%% sc2 <- subset(sc2all, scores == "saturated", select = c("delta", "theta"))
%% sc2$saturated <- subset(sc2all, scores == "saturated")$prop23*100
%% sc2$meanvar <- subset(sc2all, scores == "meanvar")$prop23*100
%% sc2$restricted <- subset(sc2all, scores == "restricted")$prop23*100

%% colnames(sc2) <- c("$\\Delta$", "$\\Theta$", "Saturated", "Mean-variance", "Restricted")
%% rownames(sc2) <- NULL

%% print(xtable(sc2, digits = 1, align = "rccrrr"), include.rownames = FALSE, sanitize.text.function = function(x){x}, floating = FALSE)
%% @
%%   \caption{Proportion of choosing more than one latent class (in \%). Scenario 2: DIF without impact.}
%%   \label{app:scenario2}

%% \bigskip

%% <<A:Impact, echo = FALSE, results = tex, eval = FALSE>>=
%% sc3all <- subset(scoresim, delta == 0, select = c("delta", "theta", "scores", "prop23"))

%% sc3 <- subset(sc3all, scores == "saturated", select = c("delta", "theta"))
%% sc3$saturated <- subset(sc3all, scores == "saturated")$prop23*100
%% sc3$meanvar <- subset(sc3all, scores == "meanvar")$prop23*100
%% sc3$restricted <- subset(sc3all, scores == "restricted")$prop23*100
              
%% colnames(sc3) <- c("$\\Delta$", "$\\Theta$", "Saturated", "Mean-variance", "Restricted")
%% rownames(sc3) <- NULL

%% print(xtable(sc3, digits = 1, align = "rccrrr"), include.rownames = FALSE, sanitize.text.function = function(x){x}, floating = FALSE)
%% @ 
%%   \caption{Proportion of choosing more than one latent class (in \%). Scenario 3: Impact without DIF.}
%%   \label{app:scenario3}
%% \end{table}


%% <<A:notCoinciding, echo = FALSE, results = tex, eval = FALSE>>=
%% ## impact == 2.4
%% sc4all <- subset(scoresim, theta == 2.4 & (scenario == 4 | delta == 0), select = c("delta", "theta", "scores", "prop23"))

%% sc4 <- subset(sc4all, scores == "saturated", select = c("delta", "theta"))
%% sc4$saturated <- subset(sc4all, scores == "saturated")$prop23*100
%% sc4$meanvar <- subset(sc4all, scores == "meanvar")$prop23*100
%% sc4$restricted <- subset(sc4all, scores == "restricted")$prop23*100

%% ## impact == 3.6
%% sc4all <- subset(scoresim, theta == 3.6 & (scenario == 4 | delta == 0), select = c("delta", "theta", "scores", "prop23"))

%% sc4.36 <- subset(sc4all, scores == "saturated", select = c("delta", "theta"))
%% sc4.36$saturated <- subset(sc4all, scores == "saturated")$prop23*100
%% sc4.36$meanvar <- subset(sc4all, scores == "meanvar")$prop23*100
%% sc4.36$restricted <- subset(sc4all, scores == "restricted")$prop23*100

%% ## combine
%% sc4 <- rbind(sc4, sc4.36)

%% colnames(sc4) <- c("$\\Delta$", "$\\Theta$", "Saturated", "Mean-variance", "Restricted")
%% rownames(sc4) <- NULL

%% print(xtable(sc4, caption = "Proportion of choosing more than one latent class (\\%). Scenario 4: Impact and DIF, not coinciding.", label = "app:scenario4", digits = 1, align = "rccrrr"), include.rownames = FALSE, sanitize.text.function = function(x){x}, table.placement = "p!")
%% @ 


%% <<A:Overselection, echo = FALSE, results = tex, eval = FALSE>>=
%% over <- subset(scoresim, theta == 3.6 & scores == "meanvar", select = c("delta", "theta", "scenario", "prop2", "prop3"))

%% over$prop2 <- over$prop2*100
%% over$prop3 <- over$prop3*100

%% over$scenario[over$delta == 0] <- NA
%% over$scenario <- factor(over$scenario, levels = 4:5, labels = c("not coinciding", "coinciding"))

%% colnames(over) <- c("$\\Delta$", "$\\Theta$", "Groups", "Correct selection", "Overselection")
%% rownames(over) <- NULL

%% print(xtable(over, caption = "Proportions of selecting the correct number of classes and overselecting the number of classes (in \\%). Impact~$\\Theta~=~3.6$. Score model: mean-variance.", label = "app:overselection", digits = 1, align = "rcccrr"), include.rownames = FALSE, sanitize.text.function = function(x){x}, table.placement = "p!")
%% @ 


%% <<A:coinciding, echo = FALSE, results = tex, eval = FALSE>>=
%% ## impact == 2.4
%% sc5all <- subset(scoresim, theta == "2.4" & (scenario == 5 | delta == 0), select = c("delta", "theta", "scores", "prop23"))

%% sc5 <- subset(sc5all, scores == "saturated", select = c("delta", "theta"))
%% sc5$saturated <- subset(sc5all, scores == "saturated")$prop23*100
%% sc5$meanvar <- subset(sc5all, scores == "meanvar")$prop23*100
%% sc5$restricted <- subset(sc5all, scores == "restricted")$prop23*100

%% ## impact == 3.6
%% sc5all <- subset(scoresim, theta == 3.6 & (scenario == 5 | delta == 0), select = c("delta", "theta", "scores", "prop23"))

%% sc5.36 <- subset(sc5all, scores == "saturated", select = c("delta", "theta"))
%% sc5.36$saturated <- subset(sc5all, scores == "saturated")$prop23*100
%% sc5.36$meanvar <- subset(sc5all, scores == "meanvar")$prop23*100
%% sc5.36$restricted <- subset(sc5all, scores == "restricted")$prop23*100

%% ## combine
%% sc5 <- rbind(sc5, sc5.36)

%% colnames(sc5) <- c("$\\Delta$", "$\\Theta$", "Saturated", "Mean-variance", "Restricted")
%% rownames(sc5) <- NULL

%% print(xtable(sc5, caption = "Proportion of choosing more than one latent class (in \\%). Scenario 5: Impact and DIF, coinciding.", label = "app:scenario5", digits = 1, align = "rccrrr"), include.rownames = FALSE, sanitize.text.function = function(x){x}, table.placement = "p!")
%% @ 


%% <<A:notCoinciding:Rand, echo = FALSE, results = tex, eval = FALSE>>=
%% ## impact == 0.4
%% sc4all <- subset(scoresim, theta == 0.4 & scenario == 4, select = c("delta", "theta", "scores", "rand2"))

%% sc4 <- subset(sc4all, scores == "saturated", select = c("delta", "theta"))
%% sc4$saturated <- subset(sc4all, scores == "saturated")$rand2*100
%% sc4$meanvar <- subset(sc4all, scores == "meanvar")$rand2*100
%% sc4$restricted <- subset(sc4all, scores == "restricted")$rand2*100

%% ## impact == 3.6
%% sc4all <- subset(scoresim, theta == 3.6 & scenario == 4, select = c("delta", "theta", "scores", "rand2"))

%% sc4.36 <- subset(sc4all, scores == "saturated", select = c("delta", "theta"))
%% sc4.36$saturated <- subset(sc4all, scores == "saturated")$rand2*100
%% sc4.36$meanvar <- subset(sc4all, scores == "meanvar")$rand2*100
%% sc4.36$restricted <- subset(sc4all, scores == "restricted")$rand2*100

%% ## combine
%% sc4 <- rbind(sc4, sc4.36)

%% colnames(sc4) <- c("$\\Delta$", "$\\Theta$", "Saturated", "Mean-variance", "Restricted")
%% rownames(sc4) <- NULL

%% print(xtable(sc4, caption = "Rand index (in \\%). Scenario 4: Impact and DIF, not coinciding.", label = "app:scenario4:Rand", digits = 1, align = "rccrrr"), include.rownames = FALSE, sanitize.text.function = function(x){x}, table.placement = "p!")
%% @ 


%% <<A:coinciding:Rand, echo = FALSE, results = tex, eval = FALSE>>=
%% ## impact == 0.4
%% sc5all <- subset(scoresim, theta == "0.4" & scenario == 5, select = c("delta", "theta", "scores", "rand2"))

%% sc5 <- subset(sc5all, scores == "saturated", select = c("delta", "theta"))
%% sc5$saturated <- subset(sc5all, scores == "saturated")$rand2*100
%% sc5$meanvar <- subset(sc5all, scores == "meanvar")$rand2*100
%% sc5$restricted <- subset(sc5all, scores == "restricted")$rand2*100

%% ## impact == 3.6
%% sc5all <- subset(scoresim, theta == 3.6 & scenario == 5, select = c("delta", "theta", "scores", "rand2"))

%% sc5.36 <- subset(sc5all, scores == "saturated", select = c("delta", "theta"))
%% sc5.36$saturated <- subset(sc5all, scores == "saturated")$rand2*100
%% sc5.36$meanvar <- subset(sc5all, scores == "meanvar")$rand2*100
%% sc5.36$restricted <- subset(sc5all, scores == "restricted")$rand2*100

%% ## combine
%% sc5 <- rbind(sc5, sc5.36)

%% colnames(sc5) <- c("$\\Delta$", "$\\Theta$", "Saturated", "Mean-variance", "Restricted")
%% rownames(sc5) <- NULL

%% print(xtable(sc5, caption = "Rand index (in \\%). Scenario 5: Impact and DIF, coinciding.", label = "app:scenario5:Rand", digits = 1, align = "rccrrr"), include.rownames = FALSE, sanitize.text.function = function(x){x}, table.placement = "p!")
%% @ 

%% \end{appendix}