Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
83.87% |
52 / 62 |
|
77.78% |
7 / 9 |
CRAP | |
0.00% |
0 / 1 |
| EventLoggingLegacyConverter | |
83.87% |
52 / 62 |
|
77.78% |
7 / 9 |
14.82 | |
0.00% |
0 / 1 |
| fromHttpRequest | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
| convertEvent | |
81.82% |
18 / 22 |
|
0.00% |
0 / 1 |
4.10 | |||
| dateTimeString | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| decodeQson | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
| getStreamName | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| isSchemaAllowed | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getSchemaUri | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
| newUUIDv4 | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
| submitEvent | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MediaWiki\Extension\EventLogging\Libs\Legacy; |
| 4 | |
| 5 | use DateTime; |
| 6 | use Exception; |
| 7 | use InvalidArgumentException; |
| 8 | use JsonException; |
| 9 | use MediaWiki\Extension\EventLogging\EventLogging; |
| 10 | use RuntimeException; |
| 11 | use UnexpectedValueException; |
| 12 | |
| 13 | // NOTE: As of 2024-07, the only legacy EventLogging schema this needs to support is |
| 14 | // MediaWikiPingback. Details about this can be found at https://phabricator.wikimedia.org/T323828. |
| 15 | // In summary, MediaWikiPingback does not use the EventLogging MediaWiki extension to produce events. |
| 16 | // The MediaWikiPingback instrument collects data from 3rd party MediaWiki installs, |
| 17 | // and we cannot force those 3rd parties to upgrade to newer versions of MediaWiki |
| 18 | // that produce events directly to eventgate-analytics-external. |
| 19 | // (See https://gerrit.wikimedia.org/r/c/mediawiki/core/+/938271/ ). |
| 20 | // |
| 21 | // The MediaWikiPingback instrument is configured to send events directly to mediawiki.org, so |
| 22 | // we only need to handle legacy conversion of events from mediawiki.org. |
| 23 | // |
| 24 | // Once we are confident that there are sufficiently few remaining 3rd party MediaWiki installs |
| 25 | // out there that send events using this legacy endpoint, we can remove this endpoint and related |
| 26 | // code (EventLogging extension's EventLoggingLegacyConverter) entirely. |
| 27 | |
/**
 * Methods to convert legacy EventLogging events into WMF Event Platform compatible ones.
 * This class mostly exists to aid in the final decommissioning of the eventlogging python backend
 * and associated components and data pipelines
 * (varnishkafka, Refine eventlogging_analytics job in analytics hadoop cluster, etc.)
 *
 * It attempts to replicate some of the logic in eventlogging/parse.py
 * https://gerrit.wikimedia.org/r/plugins/gitiles/eventlogging/+/refs/heads/master/eventlogging/parse.py
 * and the WMF configured varnishkafka logger. However, because varnishkafka has
 * access to data that is not provided by the producer client (e.g. seqId, client IP, etc.),
 * this class does not support those kinds of features. It does its best to translate
 * the client produced legacy event into a WMF Event Platform compatible one.
 *
 * NOTE: The varnishkafka log format for eventlogging was:
 * '%q %l %n %{%FT%T}t %{X-Client-IP}o "%{User-agent}i'
 *
 * == Differences from original eventlogging/parse.py + format
 *
 * - seqId %n is not supported.
 *
 * - recvFrom is populated from REMOTE_HOST or REMOTE_ADDR, instead of the varnish cache hostname %l.
 *
 * - Receive timestamp is generated here, instead of the cache host request receive timestamp %t.
 *
 * - Client IP is not supported.
 *
 * - EventLogging Capsule id field will be set to a random uuid4,
 *   instead of a uuid5 built from event content.
 */
class EventLoggingLegacyConverter {

	/**
	 * Maps legacy EventLogging schema names to the migrated WMF Event Platform
	 * schema version to be used.
	 *
	 * A schema must be declared here in order for it to be allowed to be produced,
	 * otherwise it will be rejected (see isSchemaAllowed() and getSchemaUri()).
	 *
	 * @var string[] Legacy schema name => migrated schema version
	 */
	public static array $schemaVersions = [
		'MediaWikiPingback' => '1.0.0',
		'Test' => '1.2.0',
	];

	/**
	 * Parses and converts a legacy EventLogging 'qson' from the HTTP query params and headers
	 * to a WMF Event Platform compatible event.
	 *
	 * The event body is read from QUERY_STRING, recvFrom from REMOTE_HOST (falling back to
	 * REMOTE_ADDR) and the user agent from HTTP_USER_AGENT. The event time is "now".
	 *
	 * @param array|null $_server If not set, global $_SERVER will be used.
	 * @return array
	 * @throws Exception If the query string is absent or not valid qson, or if the
	 *  event cannot be converted (see convertEvent()).
	 */
	public static function fromHttpRequest( ?array $_server = null ): array {
		$_server ??= $_SERVER;

		// A request without a query string is treated as an empty payload, so that it is
		// rejected by decodeQson() with a JsonException instead of failing with a TypeError.
		$decodedEvent = self::decodeQson( $_server['QUERY_STRING'] ?? '' );

		return self::convertEvent(
			$decodedEvent,
			new DateTime(),
			$_server['REMOTE_HOST'] ?? $_server['REMOTE_ADDR'] ?? null,
			$_server['HTTP_USER_AGENT'] ?? null
		);
	}

	/**
	 * Converts the legacy EventLogging event to a WMF Event Platform compatible one.
	 *
	 * Adds/overwrites `$schema`, `meta.stream`, `dt` and `client_dt`; sets `uuid` only if the
	 * event does not already carry one; sets `recvFrom` and `http.request_headers.user-agent`
	 * only when the corresponding argument is given. All other fields are passed through.
	 *
	 * @param array $event Decoded legacy event. Must have a 'schema' field.
	 * @param DateTime|null $dt Event time. Defaults to the current time.
	 * @param string|null $recvFrom
	 * @param string|null $userAgent
	 * @return array
	 * @throws InvalidArgumentException If 'schema' is missing or is not a scalar value.
	 * @throws UnexpectedValueException If the schema is not an allowed legacy schema.
	 * @throws Exception
	 */
	public static function convertEvent(
		array $event,
		?DateTime $dt = null,
		?string $recvFrom = null,
		?string $userAgent = null
	): array {
		if ( !isset( $event['schema'] ) ) {
			throw new InvalidArgumentException(
				'Event is missing \'schema\' field. ' .
				'This is required to convert to WMF Event Platform event.'
			);
		}

		// The event comes from an untrusted client: a JSON array/object in 'schema' would
		// otherwise surface as a TypeError from the string-typed helpers below.
		if ( !is_scalar( $event['schema'] ) ) {
			throw new InvalidArgumentException(
				'Event \'schema\' field must be a string.'
			);
		}
		$schemaName = (string)$event['schema'];

		$event['$schema'] = self::getSchemaUri( $schemaName );
		$event['meta'] = [
			'stream' => self::getStreamName( $schemaName ),
		];

		// NOTE: We do not have a sequence num seqId, so we can't use a url based uuid5
		// eventlogging backend parse.py did. Instead, use a random uuid4.
		$event['uuid'] ??= self::newUUIDv4();

		$event['dt'] = self::dateTimeString( $dt ?? new DateTime() );
		// NOTE: `client_dt` is 'legacy' event time. `dt` is the preferred event time field
		$event['client_dt'] = $event['dt'];

		if ( $recvFrom !== null ) {
			$event['recvFrom'] = $recvFrom;
		}

		if ( $userAgent !== null ) {
			$event['http'] = [
				'request_headers' => [
					'user-agent' => $userAgent
				]
			];
		}

		return $event;
	}

	/**
	 * Returns an ISO-8601 UTC datetime string with 'zulu' timezone notation and
	 * millisecond precision, e.g. 2024-01-02T03:04:05.678Z.
	 * If $dt is not given, returns for current timestamp.
	 *
	 * $dt may be in any timezone; it is converted to UTC for formatting.
	 * The passed object is not modified.
	 *
	 * @param DateTime|null $dt
	 * @return string
	 */
	public static function dateTimeString( ?DateTime $dt = null ): string {
		// Work on a copy: DateTime::setTimezone() mutates the object it is called on.
		$utc = $dt === null ? new DateTime() : clone $dt;
		// The trailing 'Z' is only truthful if the wall-clock fields are in UTC.
		$utc->setTimezone( new \DateTimeZone( 'UTC' ) );

		// 'v' is milliseconds (microseconds truncated to 3 digits).
		return $utc->format( 'Y-m-d\TH:i:s.v\Z' );
	}

	/**
	 * 'qson' is a term found in the legacy eventlogging python codebase. It is URL encoded JSON.
	 * This parses URL encoded json data into a PHP assoc array.
	 *
	 * Leading/trailing '?', '&' and ';' characters are stripped before decoding.
	 *
	 * @param string $data
	 * @return array
	 * @throws JsonException If the decoded string is not valid JSON.
	 * @throws UnexpectedValueException If the JSON is valid but is not an object or array
	 *  (e.g. a bare number, string or null).
	 */
	public static function decodeQson( string $data ): array {
		$json = rawurldecode( trim( $data, '?&;' ) );
		$decoded = json_decode( $json, true, 512, JSON_THROW_ON_ERROR );

		// json_decode() happily returns scalars/null for valid JSON like `123` or `null`;
		// fail with a catchable exception rather than a return-type TypeError.
		if ( !is_array( $decoded ) ) {
			throw new UnexpectedValueException(
				'Decoded qson must be a JSON object or array, got ' . gettype( $decoded ) . '.'
			);
		}

		return $decoded;
	}

	/**
	 * Converts legacy EventLogging schema name to migrated Event Platform stream name.
	 *
	 * @param string $schemaName
	 * @return string 'eventlogging_' followed by the schema name.
	 */
	public static function getStreamName( string $schemaName ): string {
		return 'eventlogging_' . $schemaName;
	}

	/**
	 * Whether the legacy schema name is declared in self::$schemaVersions.
	 * The lookup is case-sensitive.
	 *
	 * @param string $schemaName
	 * @return bool
	 */
	public static function isSchemaAllowed( string $schemaName ): bool {
		return array_key_exists( $schemaName, self::$schemaVersions );
	}

	/**
	 * Converts the EventLogging legacy $schemaName to the migrated WMF
	 * Event Platform schema URI. This expects that the migrated schema URI is at
	 * /analytics/legacy/<schemaName>/<version>, with <schemaName> lower-cased.
	 *
	 * @param string $schemaName
	 * @return string
	 * @throws UnexpectedValueException If $schemaName is not an allowed legacy schema.
	 */
	public static function getSchemaUri( string $schemaName ): string {
		if ( !self::isSchemaAllowed( $schemaName ) ) {
			throw new UnexpectedValueException(
				"$schemaName is not in the list of allowed legacy schemas."
			);
		}

		$version = self::$schemaVersions[$schemaName];
		return '/analytics/legacy/' . strtolower( $schemaName ) . '/' . $version;
	}

	/**
	 * Return an RFC4122 compliant v4 UUID
	 *
	 * Taken from MediaWiki Wikimedia\UUID\GlobalIdGenerator.
	 *
	 * @return string
	 */
	public static function newUUIDv4(): string {
		// 16 random bytes => 32 hex characters.
		$hex = bin2hex( random_bytes( 32 / 2 ) );

		return sprintf(
			'%s-%s-%s-%s-%s',
			// "time_low" (32 bits)
			substr( $hex, 0, 8 ),
			// "time_mid" (16 bits)
			substr( $hex, 8, 4 ),
			// "time_hi_and_version" (16 bits)
			'4' . substr( $hex, 12, 3 ),
			// "clk_seq_hi_res" (8 bits, variant is binary 10x) and "clk_seq_low" (8 bits)
			dechex( 0x8 | ( hexdec( $hex[15] ) & 0x3 ) ) . $hex[16] . substr( $hex, 17, 2 ),
			// "node" (48 bits)
			substr( $hex, 19, 12 )
		);
	}

	/**
	 * Extracts stream name from event in meta.stream field and calls EventLogging::submit
	 *
	 * @param array $event
	 * @return void
	 * @throws RuntimeException If meta.stream is missing or empty.
	 */
	public static function submitEvent( array $event ): void {
		$streamName = $event['meta']['stream'] ?? null;
		if ( !$streamName ) {
			throw new RuntimeException(
				'Cannot submit event: event must have stream name set in meta.stream field.'
			);
		}
		EventLogging::submit( $streamName, $event );
	}

}