Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
14 / 14 |
|
100.00% |
4 / 4 |
CRAP | |
100.00% |
1 / 1 |
UserSplitterInstrumentation | |
100.00% |
14 / 14 |
|
100.00% |
4 / 4 |
6 | |
100.00% |
1 / 1 |
getUserHash | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
isSampled | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
getBucket | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
scaledHash | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\MetricsPlatform\UserSplitter; |
4 | |
5 | use Wikimedia\Assert\Assert; |
6 | |
/**
 * Deterministic sampling and bucketing based on user IDs.
 *
 * A user ID is first turned into a deterministic hash with a uniform
 * probability distribution in the range [0.0, 1.0), for example with
 * #getUserHash() (see also UserHashGenerate). That hash is then passed to
 * #isSampled() and #getBucket().
 *
 * Given an example user that is assigned 0.421 and 3 buckets (A, B, C), it works as follows:
 *
 * - The assigned float is scaled to cover the three buckets, in #scaledHash().
 *   0.421 * 3 = 1.263
 *
 * - Each whole number represents a bucket. In this case we're in bucket B.
 *   A = 0.x, B = 1.x, C = 2.x
 *
 * - The fraction within each number represents the sample, so if our sample ratio
 *   is 0.5, then x.00 to x.49 would be sampled, and x.50 to x.99 would be unsampled.
 *   In this case we're 1.263 which is sampled, and in bucket B.
 *
 * @license GPL-2.0-or-later
 * @internal
 */
class UserSplitterInstrumentation {

	/**
	 * Hash a user ID together with an experiment name into a float between
	 * 0.0 (inclusive) and 1.0 (non-inclusive).
	 *
	 * The user ID and the experiment name are concatenated, hashed with MD5, and
	 * the first 6 hex digits (24 bits) of the digest are divided by 2^24.
	 *
	 * Note: the two inputs are joined without a separator, so for example
	 * (1, "2x") and (12, "x") produce the same hash. This is kept as-is because
	 * changing it would re-assign existing users.
	 *
	 * @param int $userId
	 * @param string $experimentName
	 * @return float Value in the range [0.0, 1.0)
	 */
	public function getUserHash( int $userId, string $experimentName ): float {
		$userIdExperimentName = $userId . $experimentName;
		// 6 hex digits give an integer in [0, 0xffffff]; dividing by 0xffffff + 1
		// keeps the result strictly below 1.0.
		return intval( substr( md5( $userIdExperimentName ), 0, 6 ), 16 ) / ( 0xffffff + 1 );
	}

	/**
	 * Whether given user is in the sample.
	 *
	 * Should be called before getBucket().
	 *
	 * Only the number of buckets matters here, not their keys or values. The
	 * fractional part of the scaled hash is compared against the sample ratio,
	 * so a ratio of 0 never samples a hash in [0.0, 1.0) and a ratio of 1
	 * always does.
	 *
	 * @param float $sampleRatio Must be in range [0, 1]; checked with Assert::parameter().
	 * @param array $buckets
	 * @param float $userHash Expected to be in the range [0.0, 1.0), see getUserHash().
	 * @return bool True if sampled, false if unsampled.
	 */
	public function isSampled( float $sampleRatio, array $buckets, float $userHash ): bool {
		Assert::parameter(
			$sampleRatio >= 0 && $sampleRatio <= 1,
			'sampleRatio',
			'Sample ratio must be in range [0, 1]'
		);

		// Take the right of the decimal.
		$sample = fmod( $this->scaledHash( $userHash, $buckets ), 1 );
		return $sample < $sampleRatio;
	}

	/**
	 * Which bucket a given user is in.
	 *
	 * This does NOT imply sample and should usually be called after isSampled().
	 *
	 * Buckets are addressed by position, so string-keyed or sparse arrays are
	 * supported as well as plain lists. A hash at or beyond the edges of the
	 * expected [0.0, 1.0) range is assigned to the nearest (first or last)
	 * bucket instead of reading a non-existent array offset.
	 *
	 * @param array $buckets
	 * @param float $userHash Expected to be in the range [0.0, 1.0), see getUserHash().
	 * @return mixed|null Bucket name or null if buckets are unused.
	 */
	public function getBucket( array $buckets, float $userHash ) {
		if ( $buckets === [] ) {
			return null;
		}

		// Re-index so the n-th bucket is always at offset n, whatever keys the caller used.
		$bucketNames = array_values( $buckets );
		$lastIndex = count( $bucketNames ) - 1;

		// Get the bucket index (int is akin to floor/truncate, but as int instead of float)
		$index = (int)$this->scaledHash( $userHash, $bucketNames );

		// Guard against an out-of-range hash (e.g. exactly 1.0) indexing past either end.
		$index = max( 0, min( $lastIndex, $index ) );

		return $bucketNames[ $index ];
	}

	/**
	 * Scale a hash by the number of buckets.
	 *
	 * With no buckets the hash is multiplied by 1, i.e. returned unchanged, so
	 * that its fractional part can still be used for sampling.
	 *
	 * @param float $userHash
	 * @param array $buckets
	 * @return float Integer component is the bucket index (from 0 to count-1),
	 *  fractional component is the position used for sampling.
	 */
	private function scaledHash( float $userHash, array $buckets ): float {
		return $userHash * max( 1, count( $buckets ) );
	}

}