1 /* 2 * Copyright 2003-2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.math.stat.regression; 18 import java.io.Serializable; 19 20 import org.apache.commons.math.MathException; 21 import org.apache.commons.math.distribution.DistributionFactory; 22 import org.apache.commons.math.distribution.TDistribution; 23 24 /** 25 * Estimates an ordinary least squares regression model 26 * with one independent variable. 27 * <p> 28 * <code> y = intercept + slope * x </code> 29 * <p> 30 * Standard errors for <code>intercept</code> and <code>slope</code> are 31 * available as well as ANOVA, r-square and Pearson's r statistics. 32 * <p> 33 * Observations (x,y pairs) can be added to the model one at a time or they 34 * can be provided in a 2-dimensional array. The observations are not stored 35 * in memory, so there is no limit to the number of observations that can be 36 * added to the model. 37 * <p> 38 * <strong>Usage Notes</strong>: <ul> 39 * <li> When there are fewer than two observations in the model, or when 40 * there is no variation in the x values (i.e. all x values are the same) 41 * all statistics return <code>NaN</code>. At least two observations with 42 * different x coordinates are requred to estimate a bivariate regression 43 * model. 44 * </li> 45 * <li> getters for the statistics always compute values based on the current 46 * set of observations -- i.e., you can get statistics, then add more data 47 * and get updated statistics without using a new instance. There is no 48 * "compute" method that updates all statistics. Each of the getters performs 49 * the necessary computations to return the requested statistic.</li> 50 * </ul> 51 * 52 * @version $Revision: 348519 $ $Date: 2005-11-23 12:12:18 -0700 (Wed, 23 Nov 2005) $ 53 */ 54 public class SimpleRegression implements Serializable { 55 56 /** Serializable version identifier */ 57 private static final long serialVersionUID = -3004689053607543335L; 58 59 /** sum of x values */ 60 private double sumX = 0d; 61 62 /** total variation in x (sum of squared deviations from xbar) */ 63 private double sumXX = 0d; 64 65 /** sum of y values */ 66 private double sumY = 0d; 67 68 /** total variation in y (sum of squared deviations from ybar) */ 69 private double sumYY = 0d; 70 71 /** sum of products */ 72 private double sumXY = 0d; 73 74 /** number of observations */ 75 private long n = 0; 76 77 /** mean of accumulated x values, used in updating formulas */ 78 private double xbar = 0; 79 80 /** mean of accumulated y values, used in updating formulas */ 81 private double ybar = 0; 82 83 // ---------------------Public methods-------------------------------------- 84 85 /** 86 * Create an empty SimpleRegression instance 87 */ 88 public SimpleRegression() { 89 super(); 90 } 91 92 /** 93 * Adds the observation (x,y) to the regression data set. 94 * <p> 95 * Uses updating formulas for means and sums of squares defined in 96 * "Algorithms for Computing the Sample Variance: Analysis and 97 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 98 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in 99 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985 100 * 101 * 102 * @param x independent variable value 103 * @param y dependent variable value 104 */ 105 public void addData(double x, double y) { 106 if (n == 0) { 107 xbar = x; 108 ybar = y; 109 } else { 110 double dx = x - xbar; 111 double dy = y - ybar; 112 sumXX += dx * dx * (double) n / (double) (n + 1.0); 113 sumYY += dy * dy * (double) n / (double) (n + 1.0); 114 sumXY += dx * dy * (double) n / (double) (n + 1.0); 115 xbar += dx / (double) (n + 1.0); 116 ybar += dy / (double) (n + 1.0); 117 } 118 sumX += x; 119 sumY += y; 120 n++; 121 } 122 123 /** 124 * Adds the observations represented by the elements in 125 * <code>data</code>. 126 * <p> 127 * <code>(data[0][0],data[0][1])</code> will be the first observation, then 128 * <code>(data[1][0],data[1][1])</code>, etc. 129 * <p> 130 * This method does not replace data that has already been added. The 131 * observations represented by <code>data</code> are added to the existing 132 * dataset. 133 * <p> 134 * To replace all data, use <code>clear()</code> before adding the new 135 * data. 136 * 137 * @param data array of observations to be added 138 */ 139 public void addData(double[][] data) { 140 for (int i = 0; i < data.length; i++) { 141 addData(data[i][0], data[i][1]); 142 } 143 } 144 145 /** 146 * Clears all data from the model. 147 */ 148 public void clear() { 149 sumX = 0d; 150 sumXX = 0d; 151 sumY = 0d; 152 sumYY = 0d; 153 sumXY = 0d; 154 n = 0; 155 } 156 157 /** 158 * Returns the number of observations that have been added to the model. 159 * 160 * @return n number of observations that have been added. 161 */ 162 public long getN() { 163 return n; 164 } 165 166 /** 167 * Returns the "predicted" <code>y</code> value associated with the 168 * supplied <code>x</code> value, based on the data that has been 169 * added to the model when this method is activated. 170 * <p> 171 * <code> predict(x) = intercept + slope * x </code> 172 * <p> 173 * <strong>Preconditions</strong>: <ul> 174 * <li>At least two observations (with at least two different x values) 175 * must have been added before invoking this method. If this method is 176 * invoked before a model can be estimated, <code>Double,NaN</code> is 177 * returned. 178 * </li></ul> 179 * 180 * @param x input <code>x</code> value 181 * @return predicted <code>y</code> value 182 */ 183 public double predict(double x) { 184 double b1 = getSlope(); 185 return getIntercept(b1) + b1 * x; 186 } 187 188 /** 189 * Returns the intercept of the estimated regression line. 190 * <p> 191 * The least squares estimate of the intercept is computed using the 192 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 193 * The intercept is sometimes denoted b0. 194 * <p> 195 * <strong>Preconditions</strong>: <ul> 196 * <li>At least two observations (with at least two different x values) 197 * must have been added before invoking this method. If this method is 198 * invoked before a model can be estimated, <code>Double,NaN</code> is 199 * returned. 200 * </li></ul> 201 * 202 * @return the intercept of the regression line 203 */ 204 public double getIntercept() { 205 return getIntercept(getSlope()); 206 } 207 208 /** 209 * Returns the slope of the estimated regression line. 210 * <p> 211 * The least squares estimate of the slope is computed using the 212 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 213 * The slope is sometimes denoted b1. 214 * <p> 215 * <strong>Preconditions</strong>: <ul> 216 * <li>At least two observations (with at least two different x values) 217 * must have been added before invoking this method. If this method is 218 * invoked before a model can be estimated, <code>Double.NaN</code> is 219 * returned. 220 * </li></ul> 221 * 222 * @return the slope of the regression line 223 */ 224 public double getSlope() { 225 if (n < 2) { 226 return Double.NaN; //not enough data 227 } 228 if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) { 229 return Double.NaN; //not enough variation in x 230 } 231 return sumXY / sumXX; 232 } 233 234 /** 235 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> 236 * sum of squared errors</a> (SSE) associated with the regression 237 * model. 238 * <p> 239 * <strong>Preconditions</strong>: <ul> 240 * <li>At least two observations (with at least two different x values) 241 * must have been added before invoking this method. If this method is 242 * invoked before a model can be estimated, <code>Double,NaN</code> is 243 * returned. 244 * </li></ul> 245 * 246 * @return sum of squared errors associated with the regression model 247 */ 248 public double getSumSquaredErrors() { 249 return getSumSquaredErrors(getSlope()); 250 } 251 252 /** 253 * Returns the sum of squared deviations of the y values about their mean. 254 * <p> 255 * This is defined as SSTO 256 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>. 257 * <p> 258 * If <code>n < 2</code>, this returns <code>Double.NaN</code>. 259 * 260 * @return sum of squared deviations of y values 261 */ 262 public double getTotalSumSquares() { 263 if (n < 2) { 264 return Double.NaN; 265 } 266 return sumYY; 267 } 268 269 /** 270 * Returns the sum of squared deviations of the predicted y values about 271 * their mean (which equals the mean of y). 272 * <p> 273 * This is usually abbreviated SSR or SSM. It is defined as SSM 274 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a> 275 * <p> 276 * <strong>Preconditions</strong>: <ul> 277 * <li>At least two observations (with at least two different x values) 278 * must have been added before invoking this method. If this method is 279 * invoked before a model can be estimated, <code>Double.NaN</code> is 280 * returned. 281 * </li></ul> 282 * 283 * @return sum of squared deviations of predicted y values 284 */ 285 public double getRegressionSumSquares() { 286 return getRegressionSumSquares(getSlope()); 287 } 288 289 /** 290 * Returns the sum of squared errors divided by the degrees of freedom, 291 * usually abbreviated MSE. 292 * <p> 293 * If there are fewer than <strong>three</strong> data pairs in the model, 294 * or if there is no variation in <code>x</code>, this returns 295 * <code>Double.NaN</code>. 296 * 297 * @return sum of squared deviations of y values 298 */ 299 public double getMeanSquareError() { 300 if (n < 3) { 301 return Double.NaN; 302 } 303 return getSumSquaredErrors() / (double) (n - 2); 304 } 305 306 /** 307 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> 308 * Pearson's product moment correlation coefficient</a>, 309 * usually denoted r. 310 * <p> 311 * <strong>Preconditions</strong>: <ul> 312 * <li>At least two observations (with at least two different x values) 313 * must have been added before invoking this method. If this method is 314 * invoked before a model can be estimated, <code>Double,NaN</code> is 315 * returned. 316 * </li></ul> 317 * 318 * @return Pearson's r 319 */ 320 public double getR() { 321 double b1 = getSlope(); 322 double result = Math.sqrt(getRSquare(b1)); 323 if (b1 < 0) { 324 result = -result; 325 } 326 return result; 327 } 328 329 /** 330 * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 331 * coefficient of determination</a>, 332 * usually denoted r-square. 333 * <p> 334 * <strong>Preconditions</strong>: <ul> 335 * <li>At least two observations (with at least two different x values) 336 * must have been added before invoking this method. If this method is 337 * invoked before a model can be estimated, <code>Double,NaN</code> is 338 * returned. 339 * </li></ul> 340 * 341 * @return r-square 342 */ 343 public double getRSquare() { 344 return getRSquare(getSlope()); 345 } 346 347 /** 348 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> 349 * standard error of the intercept estimate</a>, 350 * usually denoted s(b0). 351 * <p> 352 * If there are fewer that <strong>three</strong> observations in the 353 * model, or if there is no variation in x, this returns 354 * <code>Double.NaN</code>. 355 * 356 * @return standard error associated with intercept estimate 357 */ 358 public double getInterceptStdErr() { 359 return Math.sqrt( 360 getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX)); 361 } 362 363 /** 364 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard 365 * error of the slope estimate</a>, 366 * usually denoted s(b1). 367 * <p> 368 * If there are fewer that <strong>three</strong> data pairs in the model, 369 * or if there is no variation in x, this returns <code>Double.NaN</code>. 370 * 371 * @return standard error associated with slope estimate 372 */ 373 public double getSlopeStdErr() { 374 return Math.sqrt(getMeanSquareError() / sumXX); 375 } 376 377 /** 378 * Returns the half-width of a 95% confidence interval for the slope 379 * estimate. 380 * <p> 381 * The 95% confidence interval is 382 * <p> 383 * <code>(getSlope() - getSlopeConfidenceInterval(), 384 * getSlope() + getSlopeConfidenceInterval())</code> 385 * <p> 386 * If there are fewer that <strong>three</strong> observations in the 387 * model, or if there is no variation in x, this returns 388 * <code>Double.NaN</code>. 389 * <p> 390 * <strong>Usage Note</strong>:<br> 391 * The validity of this statistic depends on the assumption that the 392 * observations included in the model are drawn from a 393 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 394 * Bivariate Normal Distribution</a>. 395 * 396 * @return half-width of 95% confidence interval for the slope estimate 397 * 398 * @throws MathException if the confidence interval can not be computed. 399 */ 400 public double getSlopeConfidenceInterval() throws MathException { 401 return getSlopeConfidenceInterval(0.05d); 402 } 403 404 /** 405 * Returns the half-width of a (100-100*alpha)% confidence interval for 406 * the slope estimate. 407 * <p> 408 * The (100-100*alpha)% confidence interval is 409 * <p> 410 * <code>(getSlope() - getSlopeConfidenceInterval(), 411 * getSlope() + getSlopeConfidenceInterval())</code> 412 * <p> 413 * To request, for example, a 99% confidence interval, use 414 * <code>alpha = .01</code> 415 * <p> 416 * <strong>Usage Note</strong>:<br> 417 * The validity of this statistic depends on the assumption that the 418 * observations included in the model are drawn from a 419 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 420 * Bivariate Normal Distribution</a>. 421 * <p> 422 * <strong> Preconditions:</strong><ul> 423 * <li>If there are fewer that <strong>three</strong> observations in the 424 * model, or if there is no variation in x, this returns 425 * <code>Double.NaN</code>. 426 * </li> 427 * <li><code>(0 < alpha < 1)</code>; otherwise an 428 * <code>IllegalArgumentException</code> is thrown. 429 * </li></ul> 430 * 431 * @param alpha the desired significance level 432 * @return half-width of 95% confidence interval for the slope estimate 433 * @throws MathException if the confidence interval can not be computed. 434 */ 435 public double getSlopeConfidenceInterval(double alpha) 436 throws MathException { 437 if (alpha >= 1 || alpha <= 0) { 438 throw new IllegalArgumentException(); 439 } 440 return getSlopeStdErr() * 441 getTDistribution().inverseCumulativeProbability(1d - alpha / 2d); 442 } 443 444 /** 445 * Returns the significance level of the slope (equiv) correlation. 446 * <p> 447 * Specifically, the returned value is the smallest <code>alpha</code> 448 * such that the slope confidence interval with significance level 449 * equal to <code>alpha</code> does not include <code>0</code>. 450 * On regression output, this is often denoted <code>Prob(|t| > 0)</code> 451 * <p> 452 * <strong>Usage Note</strong>:<br> 453 * The validity of this statistic depends on the assumption that the 454 * observations included in the model are drawn from a 455 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 456 * Bivariate Normal Distribution</a>. 457 * <p> 458 * If there are fewer that <strong>three</strong> observations in the 459 * model, or if there is no variation in x, this returns 460 * <code>Double.NaN</code>. 461 * 462 * @return significance level for slope/correlation 463 * @throws MathException if the significance level can not be computed. 464 */ 465 public double getSignificance() throws MathException { 466 return 2d* (1.0 - getTDistribution().cumulativeProbability( 467 Math.abs(getSlope()) / getSlopeStdErr())); 468 } 469 470 // ---------------------Private methods----------------------------------- 471 472 /** 473 * Returns the intercept of the estimated regression line, given the slope. 474 * <p> 475 * Will return <code>NaN</code> if slope is <code>NaN</code>. 476 * 477 * @param slope current slope 478 * @return the intercept of the regression line 479 */ 480 private double getIntercept(double slope) { 481 return (sumY - slope * sumX) / ((double) n); 482 } 483 484 /** 485 * Returns the sum of squared errors associated with the regression 486 * model, using the slope of the regression line. 487 * <p> 488 * Returns NaN if the slope is NaN. 489 * 490 * @param b1 current slope 491 * @return sum of squared errors associated with the regression model 492 */ 493 private double getSumSquaredErrors(double b1) { 494 return sumYY - sumXY * sumXY / sumXX; 495 } 496 497 /** 498 * Computes r-square from the slope. 499 * <p> 500 * will return NaN if slope is Nan. 501 * 502 * @param b1 current slope 503 * @return r-square 504 */ 505 private double getRSquare(double b1) { 506 double ssto = getTotalSumSquares(); 507 return (ssto - getSumSquaredErrors(b1)) / ssto; 508 } 509 510 /** 511 * Computes SSR from b1. 512 * 513 * @param slope regression slope estimate 514 * @return sum of squared deviations of predicted y values 515 */ 516 private double getRegressionSumSquares(double slope) { 517 return slope * slope * sumXX; 518 } 519 520 /** 521 * Uses distribution framework to get a t distribution instance 522 * with df = n - 2 523 * 524 * @return t distribution with df = n - 2 525 */ 526 private TDistribution getTDistribution() { 527 return DistributionFactory.newInstance().createTDistribution(n - 2); 528 } 529 }