Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
71.83% |
51 / 71 |
|
60.00% |
6 / 10 |
CRAP | |
0.00% |
0 / 1 |
LegacyBeaconHandler | |
71.83% |
51 / 71 |
|
60.00% |
6 / 10 |
25.24 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
48.28% |
14 / 29 |
|
0.00% |
0 / 1 |
6.21 | |||
needsReadAccess | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
needsWriteAccess | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
convertEvent | |
84.21% |
16 / 19 |
|
0.00% |
0 / 1 |
5.10 | |||
dateTimeString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
decodeQson | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
getStreamName | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isSchemaAllowed | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getSchemaUri | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\EventLogging\Rest\Handler; |
4 | |
5 | use DateTime; |
6 | use Exception; |
7 | use InvalidArgumentException; |
8 | use JsonException; |
9 | use MediaWiki\Config\ServiceOptions; |
10 | use MediaWiki\Extension\EventLogging\EventSubmitter\EventSubmitter; |
11 | use MediaWiki\Logger\LoggerFactory; |
12 | use MediaWiki\Rest\Handler; |
13 | use Psr\Log\LoggerInterface; |
14 | use UnexpectedValueException; |
15 | use WikiMap; |
16 | use Wikimedia\UUID\GlobalIdGenerator; |
17 | |
18 | // NOTE: As of 2024-07, the only legacy EventLogging schema this needs to support is |
19 | // MediaWikiPingback. Details about this can be found at https://phabricator.wikimedia.org/T323828. |
20 | // In summary, MediaWikiPingback does not use the EventLogging MediaWiki extension to produce events. |
21 | // MediaWikiPingback instrument collects data from 3rd party MediaWiki installs, |
22 | // and we cannot force those 3rd parties to upgrade to newer versions of MediaWiki |
23 | // that produce events directly to eventgate-analytics-external. |
24 | // (See https://gerrit.wikimedia.org/r/c/mediawiki/core/+/938271/ ). |
25 | // |
26 | // The MediaWikiPingback instrument is configured to send events directly to mediawiki.org, so |
27 | // we only need to handle legacy conversion of events from mediawiki.org. |
28 | // |
29 | // Once we are confident that there are sufficiently few remaining 3rd party MediaWiki installs |
30 | // out there that send events using this legacy endpoint, we can remove this endpoint and related |
31 | // code (EventLogging extension's EventLoggingLegacyConverter) entirely. |
32 | |
33 | /** |
34 | * GET /eventlogging/v0/beacon/?{qson_encoded_legacy_event} |
35 | * |
36 | * Converts legacy EventLogging events into WMF Event Platform compatible ones and submits |
37 | * them using the provided EventSubmitter. |
38 | * |
39 | * Expects that the incoming HTTP query string is a 'qson' event (URL encoded JSON string). |
40 | * This event will be parsed, converted and posted to EVENT_INTAKE_URL env var, |
41 | * or the local eventgate-analytics-external service. |
42 | * |
43 | * This class mostly exists to aid in the final decommissioning of the eventlogging python backend |
44 | * and associated components and data pipelines |
45 | * (varnishkafka, Refine eventlogging_analytics job in analytics hadoop cluster, etc.) |
46 | * |
47 | * It attempts to replicate some of the logic in eventlogging/parse.py |
48 | * https://gerrit.wikimedia.org/r/plugins/gitiles/eventlogging/+/refs/heads/master/eventlogging/parse.py |
49 | * and the WMF configured varnishkafka logger. However, because varnishkafka has |
50 | * access to data that is not provided by the producer client (e.g. seqId, client IP, etc.), |
51 | * this class does not support those kinds of features. It does its best to translate |
52 | * the client produced legacy event into a WMF Event Platform compatible one. |
53 | * |
54 | * NOTE: The varnishkafka log format for eventlogging was: |
55 | * '%q %l %n %{%FT%T}t %{X-Client-IP}o "%{User-agent}i"' |
56 | * |
57 | * == Differences from original eventlogging/parse.py + format |
58 | * |
59 | * - seqId %n is not supported. |
60 | * |
61 | * - recvFrom is populated from REMOTE_HOST or REMOTE_ADDR, instead of the varnish cache hostname %l. |
62 | * |
63 | * - Receive timestamp is generated here, instead of the cache host request receive timestamp %t. |
64 | * |
65 | * - Client IP is not supported. |
66 | * |
67 | * - EventLogging Capsule id field will be set to a random uuid4, |
68 | * instead of a uuid5 built from event content. |
69 | */ |
class LegacyBeaconHandler extends Handler {

	/**
	 * MediaWiki Config key EventLoggingLegacyBeaconAllowedWikiIds.
	 */
	private const ALLOWED_WIKI_IDS_CONFIG_KEY = 'EventLoggingLegacyBeaconAllowedWikiIds';

	/**
	 * Option keys that must be present in the ServiceOptions passed to the constructor.
	 */
	public const CONSTRUCTOR_OPTIONS = [
		self::ALLOWED_WIKI_IDS_CONFIG_KEY
	];

	/**
	 * Maps legacy EventLogging schema names to the migrated WMF Event Platform
	 * schema version to be used.
	 *
	 * A schema must be declared here in order for it to be allowed to be produced,
	 * otherwise it will be rejected.
	 *
	 * NOTE: This is hardcoded here (instead of parameterized in config) because
	 * we do not intend to ever add entries to this. Hopefully RestApiLegacyBeacon
	 * can be removed entirely in a few years.
	 *
	 * @var string[] Schema name => schema version.
	 */
	public static array $schemaVersions = [
		'MediaWikiPingback' => '1.0.0',
		'Test' => '1.2.0',
	];

	/**
	 * Value of the EventLoggingLegacyBeaconAllowedWikiIds option. The current wiki ID
	 * must be contained in this array for execute() to forward an event.
	 *
	 * @var array
	 */
	private array $allowedWikiIds;

	/**
	 * Receives the converted events.
	 *
	 * @var EventSubmitter
	 */
	private EventSubmitter $eventSubmitter;

	/**
	 * Used to generate the uuid assigned to each converted event.
	 *
	 * @var GlobalIdGenerator
	 */
	private GlobalIdGenerator $globalIdGenerator;

	/**
	 * @var LoggerInterface
	 */
	private LoggerInterface $logger;

	/**
	 * @param ServiceOptions $options Must provide every key in CONSTRUCTOR_OPTIONS.
	 * @param EventSubmitter $eventSubmitter
	 * @param GlobalIdGenerator $globalIdGenerator
	 */
	public function __construct(
		ServiceOptions $options,
		EventSubmitter $eventSubmitter,
		GlobalIdGenerator $globalIdGenerator
	) {
		$options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );

		$this->eventSubmitter = $eventSubmitter;
		$this->globalIdGenerator = $globalIdGenerator;
		$this->allowedWikiIds = $options->get( self::ALLOWED_WIKI_IDS_CONFIG_KEY );
		$this->logger = LoggerFactory::getInstance( self::class );
	}

	/**
	 * Decodes the qson query string, converts it to a WMF Event Platform event and
	 * hands it to the EventSubmitter.
	 *
	 * Decoding and conversion failures are logged and otherwise ignored: the same
	 * "no content" response is returned on every path.
	 *
	 * @return mixed The response created by ResponseFactory::createNoContent().
	 */
	public function execute() {
		// we will always return 204, no matter what.
		$response = $this->getResponseFactory()->createNoContent();

		// Restrict this API endpoint to allowedWikiIds.
		$wiki = WikiMap::getCurrentWikiId();
		if ( !in_array( $wiki, $this->allowedWikiIds ) ) {
			$this->logger->error( "Cannot forward legacy event: LegacyEventBeacon is disabled on $wiki." );
			return $response;
		}

		// Decode the 'event' out of the qson encoded query string
		$queryString = $this->getRequest()->getUri()->getQuery();
		try {
			$decodedEvent = self::decodeQson( $queryString );
		} catch ( Exception $e ) {
			$this->logger->error(
				"Failed decoding query string as 'qson' event: " . $e->getMessage(),
				[ 'exception' => $e ]
			);
			return $response;
		}

		// Convert the event to WMF Event Platform compatible
		try {
			$event = self::convertEvent(
				$decodedEvent,
				new DateTime(),
				// NOTE(review): only REMOTE_HOST is consulted; when it is absent, recvFrom is
				// omitted from the event. The class documentation also mentions REMOTE_ADDR;
				// confirm which behaviour is intended.
				$this->getRequest()->getServerParams()['REMOTE_HOST'] ?? null,
				$this->getRequest()->getHeader( 'user-agent' )[0] ?? null,
				$this->globalIdGenerator->newUUIDv4()
			);
		} catch ( Exception $e ) {
			$this->logger->error(
				'Failed converting event from legacy EventLogging to WMF Event Platform compatible: ' .
				$e->getMessage(),
				[ 'exception' => $e ]
			);
			return $response;
		}

		// submit event (likely in a DeferredUpdate via EventBusEventSubmitter).
		$this->eventSubmitter->submit( $event['meta']['stream'], $event );

		// 204 HTTP response
		return $response;
	}

	/**
	 * @return bool Always false.
	 */
	public function needsReadAccess(): bool {
		return false;
	}

	/**
	 * @return bool Always false.
	 */
	public function needsWriteAccess(): bool {
		return false;
	}

	/**
	 * Converts the legacy EventLogging event to a WMF Event Platform compatible one.
	 *
	 * The returned array is a copy of $event with '$schema', 'meta' and 'client_dt' set
	 * (any existing values under those keys are overwritten), plus 'uuid', 'recvFrom'
	 * and 'http' when the corresponding argument is provided.
	 *
	 * @param array $event Decoded legacy event; must contain a string 'schema' field.
	 * @param DateTime|null $dt Used for 'client_dt'. Defaults to the current time.
	 * @param string|null $recvFrom Stored as 'recvFrom' when not null.
	 * @param string|null $userAgent Stored as http.request_headers.user-agent when not null.
	 * @param string|null $uuid Stored as 'uuid' when neither null nor the empty string.
	 * @return array
	 * @throws InvalidArgumentException If 'schema' is missing or is not a string.
	 * @throws UnexpectedValueException If the schema is not in the allowed list.
	 */
	public static function convertEvent(
		array $event,
		?DateTime $dt = null,
		?string $recvFrom = null,
		?string $userAgent = null,
		?string $uuid = null
	): array {
		if ( !isset( $event['schema'] ) ) {
			throw new InvalidArgumentException(
				'Event is missing \'schema\' field. This is required to convert to WMF Event Platform event.'
			);
		}

		// The event comes from an untrusted query string. Reject a non-string schema
		// explicitly: passing e.g. an array to getSchemaUri( string ) would raise a
		// TypeError, which is not an Exception and would escape callers' catch blocks.
		if ( !is_string( $event['schema'] ) ) {
			throw new InvalidArgumentException(
				'Event \'schema\' field must be a string, got ' . gettype( $event['schema'] ) . '.'
			);
		}

		$event['$schema'] = self::getSchemaUri( $event['schema'] );
		$event['meta'] = [
			'stream' => self::getStreamName( $event['schema'] ),
		];

		// An empty uuid is treated the same as no uuid.
		if ( $uuid !== null && $uuid !== '' ) {
			$event['uuid'] = $uuid;
		}

		// NOTE: `client_dt` is 'legacy' event time.
		$event['client_dt'] = self::dateTimeString( $dt ?? new DateTime() );

		if ( $recvFrom !== null ) {
			$event['recvFrom'] = $recvFrom;
		}

		if ( $userAgent !== null ) {
			$event['http'] = [
				'request_headers' => [ 'user-agent' => $userAgent ],
			];
		}

		return $event;
	}

	/**
	 * Returns an ISO-8601 UTC datetime string with 'zulu' timezone notation and
	 * millisecond precision, e.g. 2024-07-01T12:34:56.789Z.
	 * If $dt is not given, returns for current timestamp.
	 *
	 * @param DateTime|null $dt
	 * @return string
	 */
	public static function dateTimeString( ?DateTime $dt = null ): string {
		// Work on a copy so that the caller's object is never modified.
		$utc = $dt !== null ? clone $dt : new DateTime();
		// The 'Z' suffix is only correct if the formatted time really is UTC.
		$utc->setTimezone( new \DateTimeZone( 'UTC' ) );

		// 'u' is microseconds (6 digits); keep only the first three for milliseconds.
		return $utc->format( 'Y-m-d\TH:i:s.' ) . substr( $utc->format( 'u' ), 0, 3 ) . 'Z';
	}

	/**
	 * 'qson' is a term found in the legacy eventlogging python codebase. It is URL encoded JSON.
	 * This parses URL encoded json data into a PHP assoc array.
	 *
	 * Leading and trailing '?', '&' and ';' characters are stripped before decoding.
	 *
	 * @param string $data
	 * @return array
	 * @throws JsonException If $data is not valid JSON, or is valid JSON that does not
	 *  decode to an array (e.g. a bare number, string, boolean or null).
	 */
	public static function decodeQson( string $data ): array {
		$json = rawurldecode( trim( $data, '?&;' ) );
		$decoded = json_decode(
			$json,
			true,
			512,
			JSON_THROW_ON_ERROR
		);

		// Valid scalar JSON would otherwise violate the array return type and raise a
		// TypeError, which callers catching Exception would not handle.
		if ( !is_array( $decoded ) ) {
			throw new JsonException(
				'Expected qson to decode to a JSON object or array, got ' . gettype( $decoded ) . '.'
			);
		}

		return $decoded;
	}

	/**
	 * Converts legacy EventLogging schema name to migrated Event Platform stream name.
	 *
	 * @param string $schemaName
	 * @return string $schemaName prefixed with 'eventlogging_'.
	 */
	public static function getStreamName( string $schemaName ): string {
		return 'eventlogging_' . $schemaName;
	}

	/**
	 * Whether $schemaName is declared in self::$schemaVersions.
	 *
	 * @param string $schemaName
	 * @return bool
	 */
	public static function isSchemaAllowed( string $schemaName ): bool {
		return array_key_exists( $schemaName, self::$schemaVersions );
	}

	/**
	 * Converts the EventLogging legacy $schemaName to the migrated WMF
	 * Event Platform schema URI. This expects that the migrated schema URI is at
	 * /analytics/legacy/<schemaName>/<version>
	 * where <schemaName> is lower-cased.
	 *
	 * @param string $schemaName
	 * @return string
	 * @throws UnexpectedValueException If $schemaName is not an allowed legacy schema.
	 */
	public static function getSchemaUri( string $schemaName ): string {
		if ( !self::isSchemaAllowed( $schemaName ) ) {
			throw new UnexpectedValueException(
				"$schemaName is not in the list of allowed legacy schemas."
			);
		}

		$version = self::$schemaVersions[$schemaName];
		return '/analytics/legacy/' . strtolower( $schemaName ) . '/' . $version;
	}
}