Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
85.71% |
36 / 42 |
|
37.50% |
3 / 8 |
CRAP | |
0.00% |
0 / 1 |
| StatsUtils | |
85.71% |
36 / 42 |
|
37.50% |
3 / 8 |
23.41 | |
0.00% |
0 / 1 |
| validateNewSampleRate | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
3 | |||
| getFilteredSamples | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
| validateMetricName | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
| validateLabelKey | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
| validateLabelValue | |
50.00% |
1 / 2 |
|
0.00% |
0 / 1 |
2.50 | |||
| normalizeArray | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| normalizeString | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| makeBucketsFromMean | |
87.50% |
14 / 16 |
|
0.00% |
0 / 1 |
4.03 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | * @file |
| 5 | */ |
| 6 | |
| 7 | declare( strict_types=1 ); |
| 8 | |
| 9 | namespace Wikimedia\Stats; |
| 10 | |
| 11 | use InvalidArgumentException; |
| 12 | use Wikimedia\Stats\Exceptions\InvalidConfigurationException; |
| 13 | |
/**
 * Functionality common to all metric types.
 *
 * A collection of static helpers for validating metric names, label keys,
 * label values and sample rates, normalizing strings into a
 * metrics-compatible form, and generating histogram buckets.
 *
 * @author Cole White
 * @since 1.38
 */
class StatsUtils {

	/**
	 * Pattern that metric names and label keys must match: a leading ASCII
	 * letter or underscore followed by ASCII letters, digits or underscores.
	 *
	 * The D (PCRE_DOLLAR_ENDONLY) modifier is required: without it, `$` also
	 * matches just before a trailing newline, so "name\n" would be accepted.
	 */
	public const RE_VALID_NAME_AND_LABEL_NAME = "/^[a-zA-Z_][a-zA-Z0-9_]*$/D";

	/** Sample rate at which every sample is kept. */
	public const DEFAULT_SAMPLE_RATE = 1.0;

	/**
	 * Validates the new sample rate. Throws InvalidArgumentException if provided an invalid rate.
	 *
	 * A valid rate lies in the inclusive range [0.0, 1.0]. NAN is rejected as well.
	 *
	 * @param float $newSampleRate
	 * @return void
	 * @throws InvalidArgumentException
	 */
	public static function validateNewSampleRate( float $newSampleRate ): void {
		// Expressed as a negated inclusive-range check: every comparison
		// involving NAN is false, so NAN fails the range test and is rejected.
		if ( !( $newSampleRate >= 0.0 && $newSampleRate <= 1.0 ) ) {
			throw new InvalidArgumentException( "Sample rate can only be between 0.0 and 1.0. Got: " . $newSampleRate );
		}
	}

	/**
	 * Returns a subset of samples based on configured sample rate.
	 *
	 * With a sample rate of exactly 1.0 the input array is returned unchanged
	 * (keys included). Otherwise each sample is kept independently when a
	 * random draw in [0, 1] is strictly below the rate, and the result is a
	 * freshly indexed list.
	 *
	 * @param float $sampleRate
	 * @param array $samples
	 * @return array
	 */
	public static function getFilteredSamples( float $sampleRate, array $samples ): array {
		if ( $sampleRate === 1.0 ) {
			return $samples;
		}
		$output = [];
		// Hoisted out of the loop: the generator's maximum is constant.
		$randMax = mt_getrandmax();
		foreach ( $samples as $sample ) {
			if ( mt_rand() / $randMax < $sampleRate ) {
				$output[] = $sample;
			}
		}
		return $output;
	}

	/**
	 * Determines if provided string is a valid name.
	 *
	 * @param string $name
	 * @return void
	 * @throws InvalidArgumentException If the name is empty.
	 * @throws InvalidConfigurationException If the name does not match
	 *  RE_VALID_NAME_AND_LABEL_NAME.
	 */
	public static function validateMetricName( string $name ): void {
		if ( $name === "" ) {
			throw new InvalidArgumentException( "Stats: Metric name cannot be empty." );
		}
		if ( !preg_match( self::RE_VALID_NAME_AND_LABEL_NAME, $name ) ) {
			throw new InvalidConfigurationException( "Invalid metric name: '" . $name . "'" );
		}
	}

	/**
	 * Determines if provided string is a valid label key.
	 *
	 * @param string $key
	 * @return void
	 * @throws InvalidArgumentException If the key is empty.
	 * @throws InvalidConfigurationException If the key does not match
	 *  RE_VALID_NAME_AND_LABEL_NAME.
	 */
	public static function validateLabelKey( string $key ): void {
		if ( $key === "" ) {
			throw new InvalidArgumentException( "Stats: Label key cannot be empty." );
		}
		if ( !preg_match( self::RE_VALID_NAME_AND_LABEL_NAME, $key ) ) {
			throw new InvalidConfigurationException( "Invalid label key: '" . $key . "'" );
		}
	}

	/**
	 * Determines if provided string is a valid label value.
	 *
	 * Only emptiness is checked; no character restrictions are applied here.
	 *
	 * @param string $value
	 * @return void
	 * @throws InvalidArgumentException If the value is empty.
	 */
	public static function validateLabelValue( string $value ): void {
		if ( $value === "" ) {
			throw new InvalidArgumentException( "Stats: Label value cannot be empty." );
		}
	}

	/**
	 * Normalize an array of strings.
	 *
	 * Each element is passed through normalizeString(). Keys of the input
	 * are not preserved; the result is a freshly indexed list.
	 *
	 * @param string[] $entities
	 * @return string[]
	 */
	public static function normalizeArray( array $entities ): array {
		$normalizedEntities = [];
		foreach ( $entities as $entity ) {
			$normalizedEntities[] = self::normalizeString( $entity );
		}
		return $normalizedEntities;
	}

	/**
	 * Normalize strings to a metrics-compatible format.
	 *
	 * Replace each run of non-alphanumeric (ASCII) characters with a single
	 * underscore. Trim leading or trailing underscores.
	 *
	 * Note: We are not using /i (case-insensitive flag)
	 * or \d (digit character class escape) here because
	 * their behavior changes with respect to locale settings.
	 *
	 * @param string $entity
	 * @return string
	 */
	public static function normalizeString( string $entity ): string {
		$entity = preg_replace( '/[^a-zA-Z0-9]+/', '_', $entity );
		return trim( $entity, '_' );
	}

	/**
	 * The E12 series
	 *
	 * Indexes 0 to 11 are the twelve steps within one decade; the closure in
	 * makeBucketsFromMean() only ever reads those indexes.
	 *
	 * @see https://en.wikipedia.org/wiki/E_series_of_preferred_numbers
	 */
	private const E12 = [
		1.0, 1.2, 1.5, 1.8, 2.2, 2.7, 3.3, 3.9, 4.7, 5.6, 6.8, 8.2, 10.0
	];

	/**
	 * Make a set of HistogramMetric buckets from a mean and skip value.
	 *
	 * Beware: this is for storing non-time data in histograms, like byte
	 * sizes, or time data outside of the range [5ms, 60s].
	 *
	 * Avoid changing the buckets once a metric has been deployed,
	 * as it may generate excessive churn.
	 *
	 * That said, this method quantizes the mean so modest shifts should
	 * maintain most buckets, and multiplying or dividing the "skip"
	 * by a small factor should also maintain commonality.
	 *
	 * The range of buckets for typical skips is roughly:
	 *
	 *   $skip = 1: [0.5*mean, 2*mean]
	 *   $skip = 2: [0.2*mean, 5*mean]
	 *   $skip = 3: [0.1*mean, 10*mean]
	 *   $skip = 4: [0.05*mean, 20*mean]
	 *   $skip = 5: [0.02*mean, 50*mean]
	 *   $skip = 6: [0.01*mean, 100*mean]
	 *   ...
	 *   $skip = 12: [0.0001*mean, 10000*mean]
	 *
	 * @param float $mean The mean value expected. Must be positive and finite.
	 * @param int $skip The range of values expected. With $skip = 1,
	 *   each bucket will be greater than the last by a factor of 10^(1/12),
	 *   which means 12 buckets per decade of range. This is the E12 series.
	 *   With $skip = 2 we take every other bucket (6 buckets per decade),
	 *   $skip = 3 means every third bucket (4 buckets per decade),
	 *   $skip = 4 means every fourth bucket (3 buckets per decade), etc.
	 *   The ::getTiming() metric effectively uses $skip = 4, which
	 *   corresponds roughly to the usual `[0.1, 0.2, 0.5, 1]`
	 *   progression. This method itself has no default for $skip;
	 *   it must always be passed explicitly. As mentioned
	 *   above, take great care when changing $skip on metrics already
	 *   in production.
	 * @return float[] An array of 9 buckets, centered around the mean
	 * @throws InvalidArgumentException If $mean is not a positive finite
	 *   number, or if $skip is less than 1.
	 */
	public static function makeBucketsFromMean( float $mean, int $skip ): array {
		if ( $mean <= 0 ) {
			throw new InvalidArgumentException( 'mean must be positive' );
		}
		// INF and NAN pass the check above, but log10() of them cannot be
		// turned into a meaningful integer position, so reject them explicitly.
		if ( !is_finite( $mean ) ) {
			throw new InvalidArgumentException( 'mean must be finite' );
		}
		if ( $skip < 1 ) {
			throw new InvalidArgumentException( 'skip must be at least 1' );
		}
		// Find the appropriate starting location in the E12 series.
		$pos = (int)round( log10( $mean ) * 12 );
		// Further quantize $pos according to $skip, so changes in $mean
		// don't shift all the buckets
		$pos -= ( $pos % $skip );
		// Compute buckets around the quantized starting position
		// By using the E12 series and powers of ten our cutoffs will
		// be compact (not too many digits) and consistent.
		return array_map( static function ( $x ) use ( $pos, $skip ) {
			$y = $pos + ( $x * $skip );
			// PHP's % keeps the sign of the dividend; shift negative
			// remainders into the 0..11 range used to index E12.
			$rem = $y % 12;
			if ( $rem < 0 ) {
				$rem += 12;
			}
			$decade = intdiv( $y - $rem, 12 ); // floor($y/12)
			// Use an explicit round() here to ensure float math doesn't create
			// extra tiny variances.
			return round( ( 10 ** $decade ) * self::E12[$rem], 1 - $decade );
		}, [
			// 9 buckets, centered around the (quantized) mean
			-4, -3, -2, -1, 0, 1, 2, 3, 4
		] );
	}
}