Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
34 / 34 |
|
100.00% |
6 / 6 |
CRAP | |
100.00% |
1 / 1 |
RunningStat | |
100.00% |
34 / 34 |
|
100.00% |
6 / 6 |
10 | |
100.00% |
1 / 1 |
getCount | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
addObservation | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
getMean | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getVariance | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
getStdDev | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
merge | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | * @author Ori Livneh <ori@wikimedia.org> |
20 | */ |
21 | |
22 | namespace Wikimedia; |
23 | |
24 | /** |
25 | * Compute running mean, variance, and extrema of a stream of numbers. |
26 | * |
27 | * RunningStat instances are accumulator-like objects that provide a set of |
28 | * continuously-updated summary statistics for a stream of numbers, without |
29 | * requiring that each value be stored. The measures it provides are the |
30 | * arithmetic mean, variance, standard deviation, and extrema (min and max); |
31 | * together they describe the central tendency and statistical dispersion of a |
32 | * set of values. |
33 | * |
34 | * One RunningStat instance can be merged into another; the resultant |
35 | * RunningStat has the state it would have had if it had accumulated each |
36 | * individual point. This allows data to be summarized in parallel and in |
37 | * stages without loss of fidelity. |
38 | * |
39 | * Based on a C++ implementation by John D. Cook: |
40 | * |
41 | * - <http://www.johndcook.com/standard_deviation.html> |
42 | * - <http://www.johndcook.com/skewness_kurtosis.html> |
43 | * |
44 | * The in-line documentation for this class incorporates content from the |
45 | * English Wikipedia articles "Variance", "Algorithms for calculating |
46 | * variance", and "Standard deviation". |
47 | */ |
48 | class RunningStat { |
49 | /** Number of samples. */ |
50 | public $n = 0; |
51 | /** The first moment (or mean, or expected value). */ |
52 | public $m1 = 0.0; |
53 | /** The second central moment (or variance). */ |
54 | public $m2 = 0.0; |
55 | /** The least value in the set. */ |
56 | public $min = INF; |
57 | /** The greatest value in the set. */ |
58 | public $max = -INF; |
59 | |
60 | /** |
61 | * Count the number of accumulated values. |
62 | * |
63 | * @return int |
64 | */ |
65 | public function getCount() { |
66 | return $this->n; |
67 | } |
68 | |
69 | /** |
70 | * Add a number to the data set. |
71 | * |
72 | * @param int|float $x Value to add |
73 | */ |
74 | public function addObservation( $x ) { |
75 | $x = (float)$x; |
76 | |
77 | $this->min = min( $this->min, $x ); |
78 | $this->max = max( $this->max, $x ); |
79 | |
80 | $n1 = $this->n; |
81 | $this->n++; |
82 | $delta = $x - $this->m1; |
83 | $delta_n = $delta / $this->n; |
84 | $this->m1 += $delta_n; |
85 | $this->m2 += $delta * $delta_n * $n1; |
86 | } |
87 | |
88 | /** |
89 | * Get the mean, or expected value. |
90 | * |
91 | * The arithmetic mean is the sum of all measurements divided by the number |
92 | * of observations in the data set. |
93 | * |
94 | * @return float |
95 | */ |
96 | public function getMean() { |
97 | return $this->m1; |
98 | } |
99 | |
100 | /** |
101 | * Get the estimated variance. |
102 | * |
103 | * Variance measures how far a set of numbers is spread out. A small |
104 | * variance indicates that the data points tend to be very close to the |
105 | * mean (and hence to each other), while a high variance indicates that the |
106 | * data points are very spread out from the mean and from each other. |
107 | * |
108 | * @return float Estimated variance |
109 | */ |
110 | public function getVariance() { |
111 | if ( $this->n === 0 ) { |
112 | // The variance of the empty set is undefined. |
113 | return NAN; |
114 | } elseif ( $this->n === 1 ) { |
115 | return 0.0; |
116 | } else { |
117 | return $this->m2 / ( $this->n - 1.0 ); |
118 | } |
119 | } |
120 | |
121 | /** |
122 | * Get the estimated standard deviation. |
123 | * |
124 | * The standard deviation of a statistical population is the square root of |
125 | * its variance. It shows how much variation from the mean exists. In |
126 | * addition to expressing the variability of a population, the standard |
127 | * deviation is commonly used to measure confidence in statistical conclusions. |
128 | * |
129 | * @return float Estimated standard deviation |
130 | */ |
131 | public function getStdDev() { |
132 | return sqrt( $this->getVariance() ); |
133 | } |
134 | |
135 | /** |
136 | * Merge another RunningStat instance into this instance. |
137 | * |
138 | * This instance then has the state it would have had if all the data had |
139 | * been accumulated by it alone. |
140 | * |
141 | * @param RunningStat $other RunningStat instance to merge into this one |
142 | */ |
143 | public function merge( RunningStat $other ) { |
144 | // If the other RunningStat is empty, there's nothing to do. |
145 | if ( $other->n === 0 ) { |
146 | return; |
147 | } |
148 | |
149 | // If this RunningStat is empty, copy values from other RunningStat. |
150 | if ( $this->n === 0 ) { |
151 | $this->n = $other->n; |
152 | $this->m1 = $other->m1; |
153 | $this->m2 = $other->m2; |
154 | $this->min = $other->min; |
155 | $this->max = $other->max; |
156 | return; |
157 | } |
158 | |
159 | $n = $this->n + $other->n; |
160 | $delta = $other->m1 - $this->m1; |
161 | $delta2 = $delta * $delta; |
162 | |
163 | $this->m1 = ( ( $this->n * $this->m1 ) + ( $other->n * $other->m1 ) ) / $n; |
164 | $this->m2 = $this->m2 + $other->m2 + ( $delta2 * $this->n * $other->n / $n ); |
165 | $this->min = min( $this->min, $other->min ); |
166 | $this->max = max( $this->max, $other->max ); |
167 | $this->n = $n; |
168 | } |
169 | } |