% Packages \documentclass[a4paper]{article} % \usepackage{floatrow} % \usepackage{url} % \usepackage{enumitem} \usepackage{authblk} \usepackage{amssymb} \usepackage{amsmath} \usepackage{amsthm} % \usepackage{rotating} \usepackage[left=2.5cm, right=2.5cm, top=3.5cm, bottom=3cm]{geometry} \usepackage{setspace} %\usepackage[onehalfspacing]{setspace} \usepackage[english]{babel} \usepackage{bbold} % \usepackage{graphicx} % \usepackage{algpseudocode} % \usepackage{algorithm} % \renewcommand{\algorithmicrequire}{\textbf{Input:}} % \renewcommand{\algorithmicensure}{\textbf{Output:}} % \newcommand{\Break}{\State \textbf{break} } % \renewcommand{\Return}{\State \textbf{return} } % \algrenewcomment[1]{\(\hfill\backslash\backslash\) #1} % %\algnewcommand{\LineComment}[1]{\State \(\backslash\backslash\) #1} % % \usepackage{caption}[2008/08/24] % % \captionsetup{margin=10pt,font=small,tableposition=b} % \usepackage{subcaption} % %\captionsetup[subfigure]{labelfont=rm} \usepackage{hyperref} % \usepackage{color} % \newcommand{\red}{\color{red}} % \newcommand{\black}{\color{black}} % \newcommand{\blue}{\color{blue}} \usepackage[authoryear,round]{natbib} \numberwithin{equation}{section} \newcounter{satze} \numberwithin{satze}{section} \theoremstyle{plain} \newtheorem{Proposition}[satze]{Proposition} \newtheorem{Lemma}[satze]{Lemma} \newtheorem{Corollary}[satze]{Corollary} \newtheorem{Theorem}[satze]{Theorem} \theoremstyle{definition} \newtheorem{Definition}[satze]{Definition} \newtheorem{Assumptions}[satze]{Assumptions} \newtheorem{Model}[satze]{Model} \newtheorem{Example}[satze]{Example} \theoremstyle{remark} \newtheorem{remark}[satze]{Remark} \parindent 0pt \newcommand{\A}{\mathcal{A}} %Algebra \newcommand{\C}{\mathbb{C}} \newcommand{\R}{\mathbb{R}} \newcommand{\Q}{\mathbb{Q}} \newcommand{\Z}{\mathbb{Z}} \newcommand{\N}{\mathbb{N}} \newcommand{\dx}{\mathrm{dx}} \newcommand{\dt}{\mathrm{dt}} \newcommand{\Var}{\operatorname{Var}} %Varianz \newcommand{\Cov}{\operatorname{Cov}} %Covarianz \newcommand{\E}{\mathbb{E}} %Erwartungswert \newcommand{\MSE}{\operatorname{MSE}} \newcommand{\MAE}{\operatorname{MAE}} \newcommand{\EINS}{\mathbb{1}} %Indikatorfunktion \newcommand{\B}{\mathcal{B}} \newcommand{\OO}{\mathcal{O}} \newcommand{\oo}{\hbox{o}} \newcommand{\Pj}{\mathbb{P}} %Projektives P \newcommand{\Px}{P} \newcommand{\Nor}{\mathcal N} \newcommand\argmin{\operatornamewithlimits{argmin}} \newcommand\argmax{\operatornamewithlimits{argmax}} \newcommand{\dd}{\mathrm{d}} \newcommand{\valmu}{m} \newcommand{\valsd}{s} \newcommand{\sn}{{d_n}} \newcommand{\snn}{{d_n}} \newcommand{\tildesn}{{\widetilde{d}_n}} \newcommand{\HSMUCE}{\operatorname{H-SMUCE}} \newcommand{\SMUCE}{\operatorname{SMUCE}} \newcommand{\JSMURF}{\operatorname{J-SMURF}} \raggedbottom % References % \usepackage[backend=bibtex, sorting=none]{biblatex} % \bibliography{references} % \usepackage{csquotes} % \makeatletter % \def\blx@maxline{77} % \makeatother % % % % Authors \author[1]{Florian Pein} \author[1,2]{Axel Munk} % \author[1]{Briant M. Bot} % \affil[1]{Institute for Mathematical Stochastics, Georg-August-University of Goettingen, Goldschmidtstr. 7, 37077 G\"ottingen, Germany} \affil[2]{Max Planck Institute for Biophysical Chemistry, Am Fassberg 11, 37077 G\"ottingen, Germany} % % \renewcommand\Authands{ and } % \renewcommand\Affilfont{\fontsize{9}{10.8}\itshape} \begin{document} %\SweaveOpts{concordance=TRUE} %\SweaveOpts{concordance=TRUE} %\VignetteIndexEntry{R package stepR} %\VignetteEngine{knitr::knitr} \title{R package \emph{stepR} \\ \emph{Multiscale change-point inference}} \maketitle % Chunk setup <>= # fig.width=5.5, library(knitr) # opts_chunk$set(fig.align='center', fig.height=5, warning=FALSE, # dev='pdf', prompt=TRUE, comment=NA, highlight=FALSE, tidy=FALSE) library(stepR) savePathRcache <- R.cache::getCacheRootPath() R.cache::setCacheRootPath(path = file.path(R.cache::getCacheRootPath(), "test")) @ \section{Introduction} Multiple change-point detection (fitting a piecewise constant function to serial observations is relevant to many applications, for instance to genetics or finance. Accordingly, this is a long standing task in statistical research and related areas. In addition to estimation / regression, statistical inference (obtaining confidence statements) for locations and size of segments is of high practical importance. The \textbf{stepR} package provides both by implementing the multiscale regression estimators $\operatorname{SMUCE}$ from \cite{smuce} and $\operatorname{HSMUCE}$ from \cite{hsmuce}. See the introduction of these two papers for a biblography, more details on the history, theory and applications of multiple change-point methods.\\ The regression model underlying the methodolodgy is described in Section \ref{sec:model}. The multiscale regression estimator, its confidence statements and technical quantities are introduced in Section \ref{sec:methodolodgy}. The functions of this package together with small examples are listed in Section \ref{sec:methods}. Finally, Sections \ref{sec:family} and \ref{sec:intervalsystem} give an overview about the supported parametric families and interval systems, respectively. \section{Model}\label{sec:model} This package deals with the estimation of a piecewise constant (step) function \begin{equation}\label{eq:signal} \gamma(t)=\sum_{k=0}^{K}{\gamma_k \EINS_{[\tau_k,\tau_{k+1})}(t)}, \end{equation} where the number of change-points $K\in\N_0$, the change-point locations $\tau_1<\cdots <\tau_{K}$ and the function values $\gamma_0,\ldots,\gamma_K\in\R$ are all unknown and have to be estimated from the data. Start and end point $\tau_0$ and $\tau_{K+1}$ are known. In other words, the set of all possible candidate functions is \begin{equation}\label{eq:candidateset} \Gamma:=\{\gamma(t)=\sum_{k=0}^{K}{\gamma_k \EINS_{[\tau_k,\tau_{k+1})}(t)},\ K\in\N_0,\ \tau_0<\cdots <\tau_{K+1},\ \gamma_0,\ldots,\gamma_K\in\R\}. \end{equation} To this end, it is assumed that a vector of length $n\in\N_0$ of noisy observations $y=(y_1, \ldots,y_n)$ from a known \textit{parametric family} $\mathcal{P}$ is given, where the function values of $\gamma$ at known design points $x=(x_1,\ldots,x_n)$ are (unknown) parameters of the family, i.e. \begin{equation} y=(y_1,\ldots,y_n)\sim \mathcal{P}(\gamma)=\mathcal{P}((\gamma(x_1), \ldots, \gamma(x_n))). \end{equation} For instance, the (homogeneous) gaussian regression model with equidistant design points $x_i=i/n$ \[y_i=\mu(i/n)+\sigma_0\epsilon_i,\quad i=1,\ldots,n\] is supported. For an overview about the supported parametric families see Section \ref{sec:family}. <>= set.seed(1) n <- 100L x <- seq(1 / n, 1, 1 / n) mu <- stepfit(cost = 0, family = "gauss", value = c(0, 3, 0, -2, 0), param = NULL, leftEnd = x[c(1, 21, 26, 71, 81)], rightEnd = x[c(20, 25, 70, 80, 100)], x0 = 0, leftIndex = c(1, 21, 26, 71, 81), rightIndex = c(20, 25, 70, 80, 100)) sigma0 <- 0.5 epsilon <- rnorm(n, 0, sigma0) y <- fitted(mu) + epsilon plot(x, y, pch = 16, col = "grey30", ylim = c(-3, 4)) lines(mu, lwd = 3) @ \section{Methodolodgy}\label{sec:methodolodgy} The \textit{multiscale regression estimator} $\hat{\gamma}$ is defined as the restricted maximizer of a functional $L^\mathcal{P}(Y)$ (for most parametric families $L^\mathcal{P}$ is the likelihood function, the exact functional for each parametric family is given in Section \ref{sec:family}), where the candidate set $\Gamma$ is restricted to all solutions of the (non-convex) optimisation problem to minimize the number of change-points under the constraint that the candidate function is accepted by a multiscale test. In formulas, the multiscale regression estimator $\hat{\gamma}$ is defined by \begin{equation}\label{eq:estimator} \hat{\gamma}:=\operatorname*{argmax}_{\gamma\in C(y, q)}{L^\mathcal{P}(y, \gamma)} \end{equation} with \begin{equation}\label{eq:optimisationproblem} C(y, q):=\{\operatorname*{argmin}_{\gamma\in\Gamma}{\vert \gamma\vert_0}\text{ s.t. } T_n(y,\gamma,q)\leq 0\}, \end{equation} with $\vert \gamma\vert_0$ the number of change-points of $\gamma$, $q=(q_1,\ldots,q_\sn)$ a vector of \textit{critical values} (see Section \ref{sec:critval}), and \textit{multiscale test} \begin{equation}\label{eq:multiscaletest} T_n(y,\gamma,q):=\max_{I\in\mathcal{I}:\ \gamma_{\vert I}\equiv \gamma_I}{T_I^{\mathcal{P}}(y, \gamma_I) - q_{\vert I \vert_0}}. \end{equation} Here $\mathcal{I}$ is the \textit{interval set} on which the candidate function is tested (to be defined in Section \ref{sec:intervalset}), $\gamma_{\vert I}$ denotes the function $\gamma$ restricted to the interval $I$, $T_I^{\mathcal{P}}$ is a \textit{local test statistic} depending on the parametric family (often the corresponding likelihood ratio test, the exact test statistic for each parametric family is given in Section \ref{sec:family}) and $\vert I \vert_0$ is the number of observations on the interval $I$, i.e. $\vert I \vert_0:=\vert \{i\in\{1,\ldots,n\}:x_i\in I\}\vert$.\\ Minimizing the number of change-point in \eqref{eq:optimisationproblem} aims to seek for the most persimonious solution, whereas the multiscale constraint aims to detect all true changes of the underlying parameter by guaranteeing that the estimate $\hat{\gamma}$ describes the data everywhere locally well by testing on each interval (of the interval set $\mathcal{I}$) whether the function value corresponds to the parameter underlying the data. More precisely, on each interval $I\in\mathcal{I}$ on which the candidate function $\gamma$ is constant with value $\gamma_I$ the hypothesis $\tilde{\gamma}=\gamma_I$ versus the alternative $\tilde{\gamma}\neq\gamma_I$ is tested assuming the observations $\{y_i: x_i\in I,\ i = 1,\ldots, n\}$ have parameter $\tilde{\gamma}$.\\ The set $C(y, q)$ \eqref{eq:optimisationproblem} is an asymptotic confidence set for $\gamma$ at level $1-\alpha$ if $q$ satisfies \eqref{eq:levelalpha}, see \cite[corollary 2]{smuce} and \cite[theorem 3.8]{hsmuce}, but difficult to visualise. Therefore, simultaneous \textit{confidence intervals} $[L_k,R_k]$ for the change-point locations $\tau_k$ and a \textit{confidence band} $B(q)$ for $\gamma$ are provided, see \cite[section 3.2]{smuce} and \cite[supplement section A.1]{hsmuce} as well as \cite[theorem 7 and corollary 3]{smuce} and \cite[theorem 3.9]{hsmuce} for theoretical verification, in particular it is shown that the union $I(q)=\{\hat{K}, B(q), [L_k,R_k]_{k=1,\ldots,\hat{K}}\}$ is sequentially honest with respect to certain subspaces $\Gamma_n$, see \cite[corollary 3]{smuce} for details. <>= fit <- stepFit(y, x = x, alpha = 0.5, jumpint = TRUE, confband = TRUE) plot(x, y, pch = 16, col = "grey30", ylim = c(-3, 4)) lines(mu, lwd = 3) lines(fit, lwd = 3, col = "red", lty = "22") # confidence intervals for the change-point locations points(jumpint(fit), col = "red") # confidence band lines(confband(fit), lty = "22", col = "darkred", lwd = 2) @ \subsection{Interval set}\label{sec:intervalset} The user can choose the interval set $\mathcal{I}$ by specifying an \textit{interval system}, see Section \ref{sec:intervalsystem} for an overview about the supported interval systems, and a set of \textit{lengths} $l=\{l_1,\ldots,l_\sn\}\subseteq \{1,\ldots,n\}$. Here, $\sn$ denotes the \textit{number of scales}, i.e. the number of different interval lengths on which will be tested. The interval set will contain all intervals of the interval system with a length that is in the set of lengths. Note that not all lengths are possible for each interval system (see Section \ref{sec:intervalsystem} for which ones are possible for which system) and also not for each parametric family (see Section \ref{sec:family} for which ones are possible for which family). \subsection{Critical values, penalisation and Monte-Carlo simulations}\label{sec:critval} The vector of critical values $q=(q_1,\ldots,q_\sn)$ is chosen such that the multiscale test is a test at \textit{significance level} $\alpha\in(0,1)$, i.e. for $\gamma_0\equiv 0$ and $z\sim\mathcal{P}(\gamma_0)$ \begin{equation}\label{eq:levelalpha} \Pj(T_n(z,\gamma_0,q) < 0)\leq \alpha \end{equation} is required. \eqref{eq:levelalpha} implies the overestimation control that the probability to overestimate the number of change-points is uniformly over all candidate functions bounded by the significance level $\alpha$, i.e. \begin{equation}\label{eq:overestimationcontrol} \sup_{\gamma \in \Gamma} \Pj_{\gamma}(\hat{K}>K)\leq \alpha. \end{equation} Here $\Pj_{\gamma}$ denotes that $\hat{K}$ is estimated based on $y\sim\mathcal{P}(\gamma)$, see \cite[(17)]{smuce} and \cite[theorem 3.3]{hsmuce}. Hence, if a strict overestimation control of the number of change-points $K$ is desirable $\alpha$ should be chosen small, e.g. $0.05$ or $0.1$. This might come at the expense of missing (smaller) change-points but with large probability not detecting too many. If change-point screening is the primarily goal, i.e. missing of change-points should be avoided, $\alpha$ should be increased, e.g. $\alpha = 0.5$ or even higher, since the error probability to underestimate the number of change-points decreases with increasing $\alpha$, see \cite[theorem 2 and (23)]{smuce} and \cite[theorem 3]{hsmuce}. If model selection, i.e. $\hat{K}=K$, is the major aim, an intermediate level that balances the over- and underestimation error should be chosen, e.g. $\alpha$ between $0.1$ and $0.5$. For a detailed discussion of the choice of $\alpha$ see \cite[section 4]{smuce} and \cite[section 3.4]{hsmuce}. This package offers two approaches to satisfy \eqref{eq:levelalpha} and to balance different scales (statistics on intervals of different lengths): \textit{scale penalisation} and \textit{balancing by weights}. \subsubsection{Scale penalisation}\label{sec:scalepenalisation} This approach, proposed in \cite{smuce}, balances different scales by a penalty function $p_{\vert I \vert_0, n}(\cdot)$ leading to the \textit{penalised multiscale statistic} \begin{equation}\label{eq:penalisedstatistic} T_n^{p_{\vert I \vert_0, n}}(y,\gamma):=\max_{I\in\mathcal{I}:\ \gamma_{\vert I}\equiv \gamma_I}{p_{\vert I \vert_0, n}(T_I^{\mathcal{P}}(y, \gamma_I))} \end{equation} and the \textit{multiscale vector of penalised statistics} \begin{equation}\label{eq:statscalepen} T_n^{l,p_{\vert I \vert_0, n}}(y,\gamma):=\left(\max_{\stackrel{I\in\mathcal{I}:\ \vert I \vert_0 = l_1,}{\gamma_{\vert I}\equiv \gamma_I}}{p_{\vert I \vert_0, n}(T_I^{\mathcal{P}}(y, \gamma_I))},\ldots,\max_{\stackrel{I\in\mathcal{I}:\ \vert I \vert_0 = l_\sn,}{\gamma_{\vert I}\equiv \gamma_I}}{p_{\vert I \vert_0, n}(T_I^{\mathcal{P}}(y, \gamma_I))}\right). \end{equation} Then, the \textit{global quantile} $q_\alpha$ at significance level $\alpha$ is defined as the $(1-\alpha)$ quantile of $T_n^{p_{\vert I \vert_0, n}}(z,\gamma_0)$ and the critical values are determined by $q_i=p_{i,n}^{-1}(q_\alpha)$. The most common penalisation $p_{\vert I \vert_0, n}(t)=\sqrt{2 t}-\sqrt{2\log(exp(1)n / \vert I \vert_0)}$, called "sqrt" in this package, implies for the families "gauss" and "mDependentPS" that \begin{equation}\label{eq:convergencepenalisedstat} T_n^{p_{\vert I \vert_0, n}}(y,\gamma) \stackrel{D}{\rightarrow} M, \end{equation} where $M$ is a functional of the standard Brownian motion and finite almost surely. For a precise definition and more details see \cite[section 2.2]{smuce}. See also its minimax optimality in \cite[theorems 5 and 6]{smuce}. Consequently, this penalty guarantees appropriate scale balancing and, hence, is the default penalty for these families. Note, that this does not apply to inhomogeneous Gaussian observations underlying the parametric family "hsmuce", hence, this penalty is not recommended here.\\ This package also supports the penalties $p_{\vert I \vert_0, n}(t)=t-\log(exp(1)n / \vert I \vert_0)$, called "log" and $p_{\vert I \vert_0, n}(t)=t$, called "none", where the statistics are not penalized and, hence, the multiscale test is dominated by smaller scales. For a discussion how these penalties influence the final outcome, see \cite[section 6.2]{smuce}. \subsubsection{Balancing by weights}\label{sec:weights} For this approach, proposed in \cite[section 2]{hsmuce} and chosen by penalty = "weights" (although this approach does not correspond to a penalty in a literal sense), the \textit{multiscale vector of statistics} is defined as \begin{equation}\label{eq:statscale} T_n^l(y,\gamma):=\left(\max_{\stackrel{I\in\mathcal{I}:\ \vert I \vert_0 = l_1,}{\gamma_{\vert I}\equiv \gamma_I}}{T_I^{\mathcal{P}}(y, \gamma)},\ldots,\max_{\stackrel{I\in\mathcal{I}:\ \vert I \vert_0 = l_\sn,}{\gamma_{\vert I}\equiv \gamma_I}}{T_I^{\mathcal{P}}(y, \gamma)}\right). \end{equation} Moreover, for given weights \begin{equation}\label{eq:weights} \beta_1,\ldots,\beta_\sn > 0,\text{ with }\sum_{k=1}^{\sn}{\beta_k}=1, \end{equation} in addition to \eqref{eq:levelalpha}, \begin{equation}\label{eq:balancing} \frac{1-F_1\left(q_1\right)}{\beta_1}=\cdots=\frac{1-F_\sn\left(q_{\sn}\right)}{\beta_\sn}, \end{equation} with $F_i$ the cumulative distribution function of the i-th entry of $T_n^l(z,\gamma_0)$, is required. The weights determine the fractions between the probabilities that a test on a certain scale rejects, and hence regulate the allocation of the level $\alpha$ among the single scales. Then, for any $\alpha\in(0,1)$ and for any weights $\beta_1,\ldots,\beta_\sn$, s.t. \eqref{eq:weights} holds, there exits a unique vector of critical values $q=(q_1,\ldots,q_\sn)\in\R_+^\sn$ which fulfils the equations \eqref{eq:levelalpha} and \eqref{eq:balancing}, see \cite[lemma 2.1]{hsmuce}. This vector can be computed by \cite[algorithm 1 in supplement A.2]{hsmuce} based on Monte-Carlo simulations. This penalty is the default and only meaningful penalty for the parametric family "hsmuce", but can also be used for other families. As default uniform weights $\beta_1=\cdots=\beta_n=1/n$ are proposed, but other choices can be used to incorporate prior knowledge on the scales, see also \cite[section 3.4]{hsmuce} for more details on the choice of the weights. \subsubsection{Monte-Carlo simulations}\label{sec:MCsim} Calculating the critical values analytically appears to be a very hard task, therefore for $y=z\in\mathcal{P}(\gamma_0)$ and $\gamma=\gamma_0$ the distribution of either the penalised multiscale statistic \eqref{eq:penalisedstatistic} or of the multiscale vector of statistics \eqref{eq:statscale}, where $l$ is the set of all lengths allowed by the interval system and by the parametric family, is simulated by Monte-Carlo simulations. This means to generate $r$, the \textit{number of repetitions}, independent samples of $z\sim\mathcal{P}(\gamma_0)$ and to compute for each the corresponding statistic. Resulting in a $\sn$ times $r$ matrix, containing $r$ independent simulations of the multiscale vector of statistics \eqref{eq:statscale}, an object of class \textit{"MCSimulationVector"}, or in an $r$ dimensional vector, containing $r$ independent simulations of the penalised multiscale statistic \eqref{eq:penalisedstatistic}, an object of class \textit{"MCSimulationMaximum"}. Since Monte-Carlo simulations can last very long, this package offers several possibilities to store them, see Section \ref{sec:storingMC} for more details. \subsection{Dynamic program}\label{sec:dynamicprogram} The multiscale regression estimator $\hat{\gamma}$ can be computed efficiently by a pruned dynamic program. The acceptance region of each local test is an interval with lower \textit{bound} $l_{I}$ and upper \textit{bound} $u_{I}$, i.e. \begin{equation}\label{eq:bounds} [l_I, u_I]:=\{\gamma_I\in\R:T_I^{\mathcal{P}}(y, \gamma_I) \leq q_{\vert I \vert_0}\}. \end{equation} Hence, to be accepted by the multiscale test a candidate function has to satisfy these constraints on all intervals of the interval set $\mathcal{I}$ on which it is constant. For more details on the computation of the estimator and its confidence statements based on these bounds see \cite[supplement section A.1]{hsmuce} and \cite[section 3]{smuce}. \subsection{Storing of the Monte-Carlo simulations}\label{sec:storingMC} Since a Monte-Carlo simulation lasts potentially much longer than the main calculations of the estimator, this package offers multiple possibilities for saving and loading the simulations. Both, an object of the class \textit{"MCSimulationVector"}, i.e. a $\sn$ times $r$ matrix, containing $r$ independent simulations of the multiscale vector of statistics \eqref{eq:statscale}, and an object of the class \textit{"MCSimulationMaximum"}, i.e. an $r$ dimensional vector, containing $r$ independent simulations of the penalised multiscale statistic \eqref{eq:penalisedstatistic} (for penalties "sqrt", "log" and "none"), can be simulated, saved and loaded. An object of the class \textit{"MCSimulationVector"} is more flexible, since critical values for all penalties and all set of lengths can be derived from it, but requires much more storage space and slightly larger storing and loading times. The Monte-Carlo simulations depend on the number of observations, the parametric family and the interval system, for \textit{"MCSimulationMaximum"} additionally the set of lengths and the used penalty matters. Note that Monte-Carlo simulations can only be saved and loaded if they are generated with the default function for generating the observations.\\ Monte-Carlo simulations can also be performed for a (slightly) larger number of observations $n_q$, this means to calculate $T_{n_q}^{p_{\vert I \vert_0, n_q}}(z,\gamma_0)$ or $T_{n_q}^l(z,\gamma_0)$ with $z=(z_1,\ldots,z_{n_q})\sim \mathcal{P}(\gamma_0)$, which avoids extensive resimulations for only a little bit varying number of observations. The overestimation control \eqref{eq:overestimationcontrol} is still satisfied but the detection power is (slightly) smaller. The number $n_q$ can be specified by the variable \textit{nq}, as default, the next larger dyadic number minus one, i.e. $n_q=2^{\lceil \log_2(n)\rceil} - 1$, is used. Additionally, the user can decide whether simulations (if required) will be performed with $n$ or with $n_q$, see below.\\ The simulations can either be stored persistently on the file system for which the package \textit{R.cache} is used or in the workspace in the global variable \textit{critValStepRTab}. Loading from the workspace is faster, but either the user has to store the workspace manually or in a new session simulations have to be performed again. Moreover, storing in and loading from variables and \textit{rds} files is supported. Finally, an initial collection of simulations can be accessed by installing the package \textit{stepRdata} available from \url{http://www.stochastik.math.uni-goettingen.de/stepRdata_1.0-0.tar.gz}.\\ Whenever computation of the critical values / the global quantile will be required, i.e. when the variable \textit{q} is not given, it will be looked for avaiable outcomes of the required Monte-Carlo simulations in the following order: They can explicitly be given in the variable \textit{stat}, they can be given as an \textit{rds} file in the variable \textit{RDSfile}, they can be loaded from the workspace, they can be loaded from the file system, they can be accessed from the \textit{stepRdata} package (if installed) and if no other option was available they will be simulated by the function \textit{monteCarloSimulation}. For the workspace and the file system it will first be looked for a vector of the penalised multiscale statistic in the workspace and then on the file system, afterwards in both for a matrix of the multiscale vector of statistics for $n$ observations, afterwards in both for a vector of the penalised multiscale statistic with $n_q$ observations and then for a matrix of the multiscale vector of statistics with $n_q$ observations. All searches can be disabled by specifying the variable \textit{options} accordingly, please see the documentation of the function \textit{critVal} for technical details.\\ The user can decide by the variable \textit{options}, too, whether a vector of the penalised multiscale statistic or matrix of the multiscale vector of statistics with $n$ or with $n_q$ observations will be simulated if simulations are required. Please take into account the explanations regarding computation time and flexibility from the beginning of this section for this choice.\\ All available simulations (either simulated, loaded or computed in the case of a vector of the penalised statistic) can be saved by all available options, please use again the variable \textit{options} for a decision which will be stored and see the documentation of the function \textit{critVal} for technical details.\\ As default, the matrix of the multiscale vector of statistics with $n_q$ observations will be stored on the file system and the penalised multiscale statistic in the work space. If required the matrix of the multiscale vector of statistics with $n_q$ observations will be simulated and it will be attempted to load the required simulations from any option in the previously described order. \section{Methods}\label{sec:methods} This section gives a brief overview about the available functions in the package, for more detailed information see the documentation of each function itself. \paragraph{stepFit} Computes the multiscale regression estimator $\hat{\gamma}$ \eqref{eq:estimator}, confidence intervals for the change-point locations and confidence bands for $\gamma$, see Section \ref{sec:methodolodgy}, by a pruned dynamic program, see Section \ref{sec:dynamicprogram}. The computation of the confidence intervals and the confidence band can be requested by the variables \textit{jumpint} and \textit{confband}, respectively. An example was given in Section \ref{sec:methodolodgy}. \paragraph{critVal} Computes the vector of critical values $q$ and the global quantile $q_\alpha$ at significance level $\alpha$ for a given penalisation, see Section \ref{sec:critval}. <>= # was called in stepFit, can be called explicitly, # for instance outside of a for loop to save computation time qVector <- critVal(length(y), alpha = 0.5) identical(stepFit(y, x = x, q = qVector, jumpint = TRUE, confband = TRUE), fit) qValue <- critVal(length(y), alpha = 0.5, output = "value") identical(stepFit(y, x = x, q = qValue, jumpint = TRUE, confband = TRUE), fit) @ \paragraph{computeStat} Computes the multiscale vector of penalised statistics $T_n^{l,p_{\vert I \vert_0, n}}(y,\gamma)$ \eqref{eq:statscalepen} and the penalised multiscale statistic $T_n^{p_{\vert I \vert_0, n}}(y,\gamma)$ \eqref{eq:penalisedstatistic} for given $\gamma\in\Gamma$. <>= # fit satisfies the multiscale contraint, i.e. # the computed penalized multiscale statistic is not larger than the global quantile computeStat(y, signal = fit, output = "maximum") < qValue # multiscale vector of statistics is componentwise not larger than # the vector of critical values all(computeStat(y, signal = fit, output = "vector") < qVector) @ \paragraph{computeBounds} Computes the multiscale contraint, i.e. the bounds \eqref{eq:bounds} for all intervals $I$ in the interval set $\mathcal{I}$. <>= # the multiscale contraint bounds <- computeBounds(y, alpha = 0.5) @ \paragraph{monteCarloSimulation} Performs Monte-Carlo simulations of the multiscale vector of statistics \eqref{eq:statscale} and of the penalised multiscale statistic \eqref{eq:penalisedstatistic} for $y=z\sim \mathcal{P}(\gamma_0)$ and $\gamma=\gamma_0$, see Section \ref{sec:MCsim}. <>= # monteCarloSimulation will be called in critVal, can be called explicitly # object of class MCSimulationVector stat <- monteCarloSimulation(n = length(y)) identical(critVal(n = length(y), alpha = 0.5, stat = stat), critVal(n = length(y), alpha = 0.5, options = list(load = list(), simulation = "matrix"))) # object of class MCSimulationMaximum stat <- monteCarloSimulation(n = length(y), output = "maximum") identical(critVal(n = length(y), alpha = 0.5, stat = stat), critVal(n = length(y), alpha = 0.5, options = list(load = list(), simulation = "vector"))) @ <>= unlink(R.cache::getCacheRootPath(), force = TRUE, recursive = TRUE) R.cache::setCacheRootPath(savePathRcache) @ Moreover, the functions \textit{compareBlocks}, \textit{neighbours}, \textit{sdrobnorm}, \textit{stepcand}, \textit{steppath}, \textit{stepsel} from version 1.0-0 are still available, please see the documentation of these functions for more details. \subsection{Deprecated functions} The following functions are deprecated, but still working, however, they might be removed in a further version. There are two groups of deprecated functions. First of all, the functions \textit{BesselPolynomial}, \textit{contMC}, \textit{dfilter}, \textit{jsmurf}, \textit{transit} are mainly used for patchclamp recordings and might be moved to a specialised package. Secondly, the functions \textit{MRC}, \textit{bounds}, \textit{smuceR}, \textit{stepbound} are deprecated and functions of the new version should be used instead. Please see the documentation of the functions itself for more information and how they can be replaced. \section{Parametric families}\label{sec:family} This package supports several parametric families (models and fitting methods). Please note that some require additional parameters. For technical details see also the documentation \textit{parametricFamily}. In addition to these families the function \textit{smuceR} supports "gaussvar", "poisson", "binomial", "gaussKern". They will be added to the new functions in a further version, too. \paragraph{"gauss"} This family implements the \textbf{S}imulataneous \textbf{MU}ltiscale \textbf{C}hange-point \textbf{E}stimator (SMUCE) \cite{smuce} for independent Gaussian observations. More precisely, independent normally distributed data $y$ with unknown mean function $\gamma:=\mu$ but known, constant standard deviation $\sigma_0>0$ are assumed, i.e. \begin{equation}\label{eq:modelgauss} y_i=\mu(x_i) + \sigma_0\epsilon_i, \quad i=1,\dots,n, \end{equation} with $\epsilon_1,\ldots,\epsilon_n$ independent standard normal distributed errors. The standard deviation $\sigma_0$ has to be either given by the user or will be estimated using the difference-based estimator \begin{equation}\label{eq:sdrobnormgauss} \frac{\operatorname{IQR}(\vert y_1-y_2\vert,\ldots, \vert y_{n-1} - y_n\vert)}{\sqrt{2}\operatorname{IQR}(\mathcal{N})} \end{equation} with $\operatorname{IQR}(x)$ the interquartile range of the sample $x$, i.e. the $75\%$-quantile minus the $25\%$-quantile of $x$, and $\operatorname{IQR}(\mathcal{N})$ the theoretical interquartile range of the standard normal distribution, i.e. the $75\%$-quantile minus the $25\%$-quantile of the standard normal distribution. This estimator is rather robust against changes of the underlying mean function and is implemented in the function \textit{sdrobnorm}. For this family the multiscale regression estimator $\hat{\gamma}$ is defined as the restricted maximum likelihood estimator, i.e. \begin{equation}\label{eq:gausslikelihood} L^\mathcal{P}(y, \gamma) := \sum_{i=1}^{n}{\big(y_i - \gamma(x_i)\big)^2}. \end{equation} The local test statistics are the statistics of the likelihood ratio test \begin{equation}\label{eq:gausslocaltests} T_I^{\mathcal{P}}(y, \gamma_I) := \frac{\vert I \vert_0\left(\overline{y}_I-\gamma_I\right)^2}{2\sigma^2}, \end{equation} with $\overline{y}_I:=\sum_{i:\ x_i\in I}{y_i}/\vert I \vert_0$, hence, the bounds are derived by \begin{equation}\label{eq:gaussbounds} [l_I, u_I] := \left[\overline{y}_I-\sigma\sqrt{\frac{2q_{\vert I \vert_0}}{\vert I \vert_0}},\overline{y}_I+\sigma\sqrt{\frac{2q_{\vert I \vert_0}}{\vert I \vert_0}}\right]. \end{equation} All lengths $\{1,\ldots,n\}$ are allowed for this family. \paragraph{"hsmuce"} This family implements the \textbf{H}eterogeneous \textbf{S}imulataneous \textbf{MU}ltiscale \textbf{C}hange-point \textbf{E}stimator (HSMUCE) \cite{hsmuce}. More precisely, independent normal distributed data $y$ with unknown mean function $\gamma:=\mu$ and also unknown standard deviation function $\sigma$ are assumed, i.e. \begin{equation}\label{eq:modelhsmuce} y_i=\mu(x_i) + \sigma(x_i)\epsilon_i, \quad i=1,\dots,n, \end{equation} with $\epsilon_1,\ldots,\epsilon_n$ independent standard normal distributed errors. Here, the unknown mean function is the parameter of interest, whereas the standard deviation is considered as a nuisance parameter. Moreover, the standard deviation is assumed to change only at the change-point locations of $\mu$, i.e. function pairs from the set \begin{equation}\label{eq:functionpairhsmuce} \left\{\big(\mu,\sigma\big)=\left(\sum_{k=0}^{K}{\mu_k \EINS_{[\tau_k,\tau_{k+1})}},\ \sum_{k=0}^{K}{\sigma_k \EINS_{[\tau_k,\tau_{k+1})}}\right),\ K\in\N_0,\ \tau_0<\cdots <\tau_{K+1},\ \mu_0,\ldots,\mu_K\in\R,\ \sigma_0,\ldots,\sigma_K>0\right\}, \end{equation} with $\mu_i\neq \mu_{i+1}$, but $\sigma_i=\sigma_{i+1}$ allowed, are assumed. For this family the multiscale regression estimator $\hat{\gamma}$ is defined as the restricted maximum likelihood estimator, i.e. \begin{equation}\label{eq:hsmucelikelihood} L^\mathcal{P}(y, \gamma) := \sum_{i=1}^{n}{\big(y_i - \gamma(x_i)\big)^2/\hat{\sigma}_i^2}, \end{equation} with $\hat{\sigma}_i=\sum_{j\in I^i}{(y_j-\gamma(x_i))^2}/\vert I^i\vert_0$, $I^i=\{j\in\{1,\ldots,n\}\ :\ \gamma(x_k)=\gamma(x_i)\ \forall\ k=j,\ldots,i\}$. The local test statistics are the statistics of the likelihood ratio test \begin{equation}\label{eq:hsmucelocaltests} T_I^{\mathcal{P}}(y, \gamma_I) := \frac{\vert I \vert_0\left(\overline{y}_I-\gamma_I\right)^2}{2\hat{\sigma}_I^2}, \end{equation} with $\overline{y}_I:=\sum_{i:\ x_i\in I}{y_i}/\vert I \vert_0$ and $\hat{\sigma}_I^2:=\sum_{i:\ x_i\in I}{(y_i-\overline{y}_I)^2}/(\vert I \vert_0 - 1)$, hence, the bounds are derived by \begin{equation}\label{eq:hsmucebounds} [l_I, u_I] := \left[\overline{y}_I-\hat{\sigma}_I\sqrt{\frac{2q_{\vert I \vert_0}}{\vert I \vert_0}},\overline{y}_I+\hat{\sigma}_I\sqrt{\frac{2q_{\vert I \vert_0}}{\vert I \vert_0}}\right]. \end{equation} All lengths larger than $1$, $\{2,\ldots,n\}$, are allowed for this family. \paragraph{"mDependentPS"} This family implements the \textbf{S}imulataneous \textbf{MU}ltiscale \textbf{C}hange-point \textbf{E}stimator (SMUCE) \cite{smuce} for m-dependent Gaussian observations with known covariance structure. More precisely, $m$-dependent normal distributed data $y$ with unknown mean function $\gamma:=\mu$ but known covariance matrix $\Sigma\in\R^{n\times n}$ are assumed, i.e. \begin{equation}\label{eq:mdependentpsgauss} y_i=\mu(x_i) + \epsilon_i, \quad i=1,\dots,n, \end{equation} with centred errors $\E[\epsilon_i]=0$ and known covariance $\Sigma_{i,j}=\Cov[\epsilon_i,\epsilon_j]=\sigma_{\vert j-i\vert}^2>0$ if $\vert j-i\vert \leq m$ and $\Sigma_{i,j}=\Cov[\epsilon_i,\epsilon_j]=0$ else. $\sigma^2=(\sigma_0^2,\ldots,\sigma_m^2)$ is called the \textit{vector of covariances}. For this family the multiscale regression estimator $\hat{\gamma}$ is defined as the restricted least squares estimator, i.e. \begin{equation}\label{eq:mdpendentpslikelihood} L^\mathcal{P}(y, \gamma) := \sum_{i=1}^{n}{\big(y_i - \gamma(x_i)\big)^2}. \end{equation} The local test statistics are the statistics of the partial sum test \begin{equation}\label{eq:mdependentpslocaltests} T_I^{\mathcal{P}}(y, \gamma_I) := \frac{\left(\sum_{i:\ x_i\in I}{y_i-\gamma_I}\right)^2}{2\Var\big(\sum_{i:\ x_i\in I}{y_i}\big)}, \end{equation} with $\Var\big(\sum_{i:\ x_i\in I}{y_i}\big)=\vert I \vert_0 \sigma_0^2 + 2\sum_{k=1}^{m}{(\vert I \vert_0-k)_+\sigma_k^2}$, $x_+=\max(x,0)$, hence, the bounds are derived by \begin{equation}\label{eq:mdependentpsbounds} [l_I, u_I] := \left[\overline{y}_I-\sqrt{\frac{2\Var\big(\sum_{i:\ x_i\in I}{y_i}\big)q_{\vert I \vert_0}}{\vert I \vert_0}},\overline{y}_I+\sqrt{\frac{2\Var\big(\sum_{i:\ x_i\in I}{y_i}\big)q_{\vert I \vert_0}}{\vert I \vert_0}}\right], \end{equation} with $\overline{y}_I:=\sum_{i:x_i\in I}{y_i}/\vert I \vert_0$. All lengths $\{1,\ldots,n\}$ are allowed for this family. \section{Interval systems}\label{sec:intervalsystem} This package supports the following interval systems. \paragraph{"all"} The system of all intervals (with start and end points at the observation grid). More precisely, the set \begin{equation}\label{eq:intervalsystemall} \{[x_i, x_j],\ 1\leq i \leq j \leq n\}. \end{equation} This system allows all lengths $1,\ldots,n$. \paragraph{"dyaLen"} The system of all intervals of dyadic length. More precisely, the set \begin{equation}\label{eq:intervalsystemdyalen} \{[x_i, x_j],\ 1\leq i \leq j \leq n \text{ s.t. } \exists\ k\in\N_0:\ j-i+1=2^k\}. \end{equation} This system allows all dyadic lengths $2^0,\ldots,2^{\lfloor \log_2(n) \rfloor}$. \paragraph{"dyaPar"} The system of the dyadic partition, i.e. all disjoint intervals of dyadic length. More precisely, the set \begin{equation}\label{eq:intervalsystemdyapar} \{[x_{(i-1)2^k + 1}, x_{i 2^k}],\ i = 1,\ldots, \lfloor n / 2^k\rfloor,\ k = 0, \ldots, \lfloor\log_2(n)\rfloor\}. \end{equation} This system allows all dyadic lengths $2^0,\ldots,2^{\lfloor \log_2(n) \rfloor}$. \bibliographystyle{apalike} %acm %\bibliographystyle{acm} %\bibliographystyle{IEEEtran} \bibliography{references} \end{document}