Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
81.82% |
153 / 187 |
|
41.67% |
5 / 12 |
CRAP | |
0.00% |
0 / 1 |
RequestLogger | |
81.82% |
153 / 187 |
|
41.67% |
5 / 12 |
71.53 | |
0.00% |
0 / 1 |
hasQueryLogs | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
sample | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
reportLogs | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
addRequest | |
78.95% |
15 / 19 |
|
0.00% |
0 / 1 |
6.34 | |||
appendLastLogPayload | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
getQueryTypesUsed | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
setResultPages | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
buildCirrusSearchRequestEvent | |
88.57% |
93 / 105 |
|
0.00% |
0 / 1 |
19.54 | |||
extractTitleStrings | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
buildLogMessage | |
81.25% |
13 / 16 |
|
0.00% |
0 / 1 |
8.42 | |||
encodeHits | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
getPhpRequestTookMs | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
4.18 |
1 | <?php |
2 | |
3 | namespace CirrusSearch; |
4 | |
5 | use ISearchResultSet; |
6 | use MediaWiki\Context\RequestContext; |
7 | use MediaWiki\Deferred\DeferredUpdates; |
8 | use MediaWiki\Json\FormatJson; |
9 | use MediaWiki\Logger\LoggerFactory; |
10 | use MediaWiki\MediaWikiServices; |
11 | use MediaWiki\User\User; |
12 | use MediaWiki\User\UserIdentity; |
13 | use MediaWiki\WikiMap\WikiMap; |
14 | |
15 | /** |
16 | * Handles logging information about requests made to various destinations, |
17 | * such as monolog, EventBus and statsd. |
18 | * |
19 | * This program is free software; you can redistribute it and/or modify |
20 | * it under the terms of the GNU General Public License as published by |
21 | * the Free Software Foundation; either version 2 of the License, or |
22 | * (at your option) any later version. |
23 | * |
24 | * This program is distributed in the hope that it will be useful, |
25 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
27 | * GNU General Public License for more details. |
28 | * |
29 | * You should have received a copy of the GNU General Public License along |
30 | * with this program; if not, write to the Free Software Foundation, Inc., |
31 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
32 | * http://www.gnu.org/copyleft/gpl.html |
33 | */ |
34 | class RequestLogger { |
35 | /** |
36 | * @const int max number of results to store in cirrussearch-request logs (per request) |
37 | */ |
38 | private const LOG_MAX_RESULTS = 50; |
39 | |
40 | // If a title hit was given to the user, but the hit was not |
41 | // obtained via ElasticSearch, we won't really know where it came from. |
42 | // We still want to log the fact that the request set generated a hit |
43 | // for this title. When this happens, the hit index field value will be this. |
44 | private const UNKNOWN_HIT_INDEX = '_UNKNOWN_'; |
45 | |
46 | /** |
47 | * @var RequestLog[] Set of requests made |
48 | */ |
49 | private $logs = []; |
50 | |
51 | /** |
52 | * @var string[] Result page ids that were returned to user |
53 | */ |
54 | private $resultTitleStrings = []; |
55 | |
56 | /** |
57 | * @var string[][] Extra payload for the logs, indexed first by the log index |
58 | * in self::$logs, and second by the payload item name. |
59 | */ |
60 | private $extraPayload = []; |
61 | |
62 | /** |
63 | * @return bool True when logs have been generated during the current |
64 | * php execution. |
65 | */ |
66 | public function hasQueryLogs() { |
67 | return (bool)$this->logs; |
68 | } |
69 | |
70 | /** |
71 | * @param float $percentage A value between 0 and 1 indiciating the |
72 | * percentage of calls that should return true. |
73 | * @return bool True on $percentage calls to this method |
74 | */ |
75 | private function sample( $percentage ) { |
76 | if ( $percentage <= 0 ) { |
77 | return false; |
78 | } |
79 | if ( $percentage >= 1 ) { |
80 | return true; |
81 | } |
82 | $rand = random_int( 0, PHP_INT_MAX ) / PHP_INT_MAX; |
83 | return $rand <= $percentage; |
84 | } |
85 | |
86 | /** |
87 | * Summarizes all the requests made in this process and reports |
88 | * them along with the test they belong to. |
89 | */ |
90 | private function reportLogs() { |
91 | if ( $this->logs ) { |
92 | |
93 | // Build the mediawiki/search/requestset event and log it to the (json+EventBus) |
94 | // cirrussearch-request channel. |
95 | LoggerFactory::getInstance( 'cirrussearch-request' )->debug( |
96 | '', $this->buildCirrusSearchRequestEvent() |
97 | ); |
98 | |
99 | // reset logs |
100 | $this->logs = []; |
101 | } |
102 | } |
103 | |
104 | /** |
105 | * @param RequestLog $log The log about a network request to be added |
106 | * @param User|null $user The user performing the request, or null |
107 | * for actions that don't have a user (such as index updates). |
108 | * @param int|null $slowMillis The threshold in ms after which the request |
109 | * will be considered slow. |
110 | * @return array A map of information about the performed request, suitible |
111 | * for use as a psr-3 log context. |
112 | */ |
113 | public function addRequest( RequestLog $log, ?UserIdentity $user = null, $slowMillis = null ) { |
114 | global $wgCirrusSearchLogElasticRequests; |
115 | |
116 | // @todo Is this necessary here? Check on what uses the response value |
117 | $finalContext = $log->getLogVariables() + [ |
118 | 'source' => Util::getExecutionContext(), |
119 | 'executor' => Util::getExecutionId(), |
120 | 'identity' => Util::generateIdentToken(), |
121 | ]; |
122 | if ( $wgCirrusSearchLogElasticRequests ) { |
123 | $this->logs[] = $log; |
124 | if ( count( $this->logs ) === 1 ) { |
125 | DeferredUpdates::addCallableUpdate( function () { |
126 | $this->reportLogs(); |
127 | } ); |
128 | } |
129 | |
130 | $logMessage = $this->buildLogMessage( $log, $finalContext ); |
131 | LoggerFactory::getInstance( 'CirrusSearchRequests' )->debug( $logMessage, $finalContext ); |
132 | if ( $slowMillis && $log->getTookMs() >= $slowMillis ) { |
133 | if ( $user !== null ) { |
134 | $finalContext['user'] = $user->getName(); |
135 | $logMessage .= ' for {user}'; |
136 | } |
137 | LoggerFactory::getInstance( 'CirrusSearchSlowRequests' )->info( $logMessage, $finalContext ); |
138 | } |
139 | } |
140 | |
141 | return $finalContext; |
142 | } |
143 | |
144 | /** |
145 | * @param string $key |
146 | * @param string $value |
147 | */ |
148 | public function appendLastLogPayload( $key, $value ) { |
149 | $idx = count( $this->logs ) - 1; |
150 | if ( isset( $this->logs[$idx] ) ) { |
151 | $this->extraPayload[$idx][$key] = $value; |
152 | } |
153 | } |
154 | |
155 | /** |
156 | * Report the types of queries that were issued |
157 | * within the current request. |
158 | * |
159 | * @return string[] |
160 | */ |
161 | public function getQueryTypesUsed() { |
162 | $types = []; |
163 | foreach ( $this->logs as $log ) { |
164 | $types[] = $log->getQueryType(); |
165 | } |
166 | return array_unique( $types ); |
167 | } |
168 | |
169 | /** |
170 | * This is set externally because we don't have complete control, from the |
171 | * SearchEngine interface, of what is actually sent to the user. Instead hooks |
172 | * receive the final results that will be sent to the user and set them here. |
173 | * |
174 | * Accepts two result sets because some places (Special:Search) perform multiple |
175 | * searches. This can be called multiple times, but only that last call wins. For |
176 | * API's that is correct, for Special:Search a hook catches the final results and |
177 | * sets them here. |
178 | * |
179 | * @param ISearchResultSet[] $matches |
180 | */ |
181 | public function setResultPages( array $matches ) { |
182 | $titleStrings = []; |
183 | foreach ( $matches as $resultSet ) { |
184 | if ( $resultSet !== null ) { |
185 | $titleStrings = array_merge( $titleStrings, $this->extractTitleStrings( $resultSet ) ); |
186 | } |
187 | } |
188 | $this->resultTitleStrings = $titleStrings; |
189 | } |
190 | |
191 | /** |
192 | * Builds and ships a log context of an object that conforms to the |
193 | * JSONSchema mediawiki/search/requestset event schema. |
194 | * |
195 | * @return array mediawiki/search/requestset event object |
196 | */ |
197 | private function buildCirrusSearchRequestEvent() { |
198 | global $wgRequest, $wgServerName; |
199 | $webrequest = $wgRequest; |
200 | |
201 | // for the moment RequestLog::getRequests() is still created in the |
202 | // old format to serve the old log formats, so here we transform the |
203 | // context into the new mediawiki/cirrussearch/request event format. |
204 | // At some point the context should just be created in the correct format. |
205 | $elasticSearchRequests = []; |
206 | |
207 | // If false, then at least one of the Elasticsearch request responses |
208 | // returned was not from cache |
209 | $allRequestsCached = true; |
210 | |
211 | // While building the elasticSearchRequests object, collect |
212 | // a list of all hits from all Elasticsearch requests. |
213 | $allHits = []; |
214 | |
215 | // Iterate over each individual Elasticsearch request |
216 | foreach ( $this->logs as $idx => $elasticSearchRequestLog ) { |
217 | foreach ( $elasticSearchRequestLog->getRequests() as $requestContext ) { |
218 | // Build an entry in the elasticSearchRequests array from each logged |
219 | // Elasticsearch request. |
220 | $requestEntry = []; |
221 | |
222 | Util::setIfDefined( $requestContext, 'query', $requestEntry, 'query', 'strval' ); |
223 | Util::setIfDefined( $requestContext, 'queryType', $requestEntry, 'query_type', 'strval' ); |
224 | Util::setIfDefined( |
225 | $requestContext, 'index', $requestEntry, 'indices', |
226 | // Use list for search indices, not csv string. |
227 | static function ( $v ) { |
228 | return explode( ',', $v ); |
229 | } |
230 | ); |
231 | Util::setIfDefined( |
232 | $requestContext, 'namespaces', $requestEntry, 'namespaces', |
233 | // Make sure namespace values are all integers. |
234 | static function ( $v ) { |
235 | if ( !$v ) { |
236 | return $v; |
237 | } else { |
238 | return array_values( array_map( 'intval', $v ) ); |
239 | } |
240 | } |
241 | ); |
242 | Util::setIfDefined( $requestContext, 'tookMs', $requestEntry, 'request_time_ms', 'intval' ); |
243 | Util::setIfDefined( $requestContext, 'elasticTookMs', $requestEntry, 'search_time_ms', 'intval' ); |
244 | Util::setIfDefined( $requestContext, 'limit', $requestEntry, 'limit', 'intval' ); |
245 | Util::setIfDefined( $requestContext, 'hitsTotal', $requestEntry, 'hits_total', 'intval' ); |
246 | Util::setIfDefined( $requestContext, 'hitsReturned', $requestEntry, 'hits_returned', 'intval' ); |
247 | Util::setIfDefined( $requestContext, 'hitsOffset', $requestEntry, 'hits_offset', 'intval' ); |
248 | Util::setIfDefined( $requestContext, 'suggestion', $requestEntry, 'suggestion', 'strval' ); |
249 | |
250 | // suggestion_requested is true if suggestionRequested or if any suggestion is present. |
251 | $requestEntry['suggestion_requested'] = |
252 | (bool)( $requestContext['suggestionRequested'] ?? isset( $requestContext['suggestion'] ) ); |
253 | |
254 | Util::setIfDefined( $requestContext, 'maxScore', $requestEntry, 'max_score', 'floatval' ); |
255 | |
256 | if ( isset( $this->extraPayload[$idx] ) ) { |
257 | // fields in extraPayload keys must be explicitly |
258 | // set here if we want to include them in the event. |
259 | Util::setIfDefined( |
260 | $this->extraPayload[$idx], 'langdetect', $requestEntry, 'langdetect', 'strval' |
261 | ); |
262 | } |
263 | |
264 | Util::setIfDefined( $requestContext, 'syntax', $requestEntry, 'syntax' ); |
265 | |
266 | // If response was servied from cache. |
267 | $requestEntry['cached'] = (bool)$elasticSearchRequestLog->isCachedResponse(); |
268 | // $wereAllResponsesCached will be used later for |
269 | // deteriming if all responses for the request set were cached. |
270 | if ( !$requestEntry['cached'] ) { |
271 | $allRequestsCached = false; |
272 | } |
273 | |
274 | Util::setIfDefined( |
275 | $requestContext, 'hits', $requestEntry, 'hits', |
276 | function ( $v ) { |
277 | return $this->encodeHits( $v ); |
278 | } |
279 | ); |
280 | if ( isset( $requestEntry['hits'] ) ) { |
281 | $allHits = array_merge( $allHits, $requestEntry['hits'] ); |
282 | } |
283 | |
284 | $elasticSearchRequests[] = $requestEntry; |
285 | } |
286 | } |
287 | |
288 | // Reindex allHits by page titles. It's maybe not perfect, but it's |
289 | // hopefully a "close enough" representation of where our final result |
290 | // set came from. maybe :( |
291 | $allHitsByTitle = []; |
292 | foreach ( $allHits as $hit ) { |
293 | '@phan-var array $hit'; |
294 | $allHitsByTitle[$hit['page_title']] = $hit; |
295 | } |
296 | $resultHits = []; |
297 | // $this->resultTitleStrings give us title hits that were actually provided to the user. |
298 | // Build a subset top level list of hits from each sub request's list of hits. |
299 | $resultTitleStrings = array_slice( $this->resultTitleStrings, 0, self::LOG_MAX_RESULTS ); |
300 | foreach ( $resultTitleStrings as $titleString ) { |
301 | // $allHitsByTitle[$titleString] will have the hit. If there isn't a hit |
302 | // for this title, then this title 'hit' did not come from a ElasticSearch request, |
303 | // so log a 'fake' hit for it. In this case, the index will be self::UNKNOWN_HIT_INDEX. |
304 | $hit = $allHitsByTitle[$titleString] ?? [ |
305 | 'page_title' => (string)$titleString, |
306 | 'index' => self::UNKNOWN_HIT_INDEX, |
307 | ]; |
308 | $resultHits[] = $hit; |
309 | } |
310 | |
311 | $gen = MediaWikiServices::getInstance()->getGlobalIdGenerator(); |
312 | $requestEvent = [ |
313 | // This schema can be found in the mediawiki/event-schemas repository. |
314 | // The $schema URI here should be updated if we increment schema versions. |
315 | '$schema' => '/mediawiki/cirrussearch/request/0.0.1', |
316 | 'meta' => [ |
317 | 'request_id' => $webrequest->getRequestId(), |
318 | 'id' => $gen->newUUIDv4(), |
319 | 'dt' => wfTimestamp( TS_ISO_8601 ), |
320 | 'domain' => $wgServerName, |
321 | 'stream' => 'mediawiki.cirrussearch-request', |
322 | ], |
323 | 'http' => [ |
324 | 'method' => $webrequest->getMethod(), |
325 | 'client_ip' => $webrequest->getIP(), |
326 | 'has_cookies' => $webrequest->getHeader( 'Cookie' ) ? true : false, |
327 | ], |
328 | 'database' => WikiMap::getCurrentWikiId(), |
329 | 'mediawiki_host' => gethostname(), |
330 | 'search_id' => Util::getRequestSetToken(), |
331 | 'source' => Util::getExecutionContext(), |
332 | 'identity' => Util::generateIdentToken(), |
333 | 'request_time_ms' => $this->getPhpRequestTookMs(), |
334 | 'all_elasticsearch_requests_cached' => $allRequestsCached, |
335 | ]; |
336 | |
337 | $webRequestValues = $webrequest->getValues(); |
338 | if ( $webRequestValues ) { |
339 | $requestEvent['params'] = []; |
340 | // Make sure all params are string keys and values |
341 | foreach ( $webRequestValues as $k => $v ) { |
342 | if ( !is_scalar( $v ) ) { |
343 | // This is potentially a multi-dimensional array. JSON is |
344 | // perhaps not the best format, but this gives a good |
345 | // guarantee about always returning a string, and |
346 | // faithfully represents the variety of shapes request |
347 | // parameters can be parsed into. |
348 | $v = FormatJson::encode( $v ); |
349 | } |
350 | $k = $webrequest->normalizeUnicode( $k ); |
351 | $requestEvent['params'][(string)$k] = (string)$v; |
352 | } |
353 | } |
354 | |
355 | // Don't set these fields if there is no data. |
356 | if ( $resultHits ) { |
357 | $requestEvent['hits'] = $resultHits; |
358 | } |
359 | if ( $elasticSearchRequests ) { |
360 | $requestEvent['elasticsearch_requests'] = $elasticSearchRequests; |
361 | } |
362 | // Should always be true, but don't accidently instantiate user testing if somehow |
363 | // it wasn't done previously. |
364 | if ( UserTestingStatus::hasInstance() ) { |
365 | $ut = UserTestingStatus::getInstance(); |
366 | if ( $ut->isActive() ) { |
367 | // Wrap into an array. UserTesting does not support multiple |
368 | // concurrent tests in one request, but the schema was written |
369 | // when that was a possibility. |
370 | $requestEvent['backend_user_tests'] = [ $ut->getTrigger() ]; |
371 | } |
372 | } |
373 | |
374 | // If in webrequests, log these request headers in http.headers. |
375 | $httpRequestHeadersToLog = [ 'accept-language', 'referer', 'user-agent' ]; |
376 | foreach ( $httpRequestHeadersToLog as $header ) { |
377 | if ( $webrequest->getHeader( $header ) ) { |
378 | $requestEvent['http']['request_headers'][$header] = |
379 | (string)$webrequest->normalizeUnicode( $webrequest->getHeader( $header ) ); |
380 | } |
381 | } |
382 | |
383 | return $requestEvent; |
384 | } |
385 | |
386 | /** |
387 | * @param ISearchResultSet $matches |
388 | * @return string[] |
389 | */ |
390 | private function extractTitleStrings( ISearchResultSet $matches ) { |
391 | $strings = []; |
392 | foreach ( $matches as $result ) { |
393 | $strings[] = (string)$result->getTitle(); |
394 | } |
395 | |
396 | return $strings; |
397 | } |
398 | |
399 | /** |
400 | * @param RequestLog $log The request to build a log message about |
401 | * @param array $context Request specific log variables from RequestLog::getLogVariables() |
402 | * @return string a PSR-3 compliant message describing $context |
403 | */ |
404 | private function buildLogMessage( RequestLog $log, array $context ) { |
405 | $message = $log->getDescription(); |
406 | $message .= " against {index} took {tookMs} millis"; |
407 | if ( isset( $context['elasticTookMs'] ) ) { |
408 | $message .= " and {elasticTookMs} Elasticsearch millis"; |
409 | if ( isset( $context['elasticTook2PassMs'] ) && $context['elasticTook2PassMs'] >= 0 ) { |
410 | $message .= " (with 2nd pass: {elasticTook2PassMs} ms)"; |
411 | } |
412 | } |
413 | if ( isset( $context['hitsTotal'] ) ) { |
414 | $message .= ". Found {hitsTotal} total results"; |
415 | $message .= " and returned {hitsReturned} of them starting at {hitsOffset}"; |
416 | } |
417 | if ( isset( $context['namespaces'] ) ) { |
418 | $namespaces = implode( ', ', $context['namespaces'] ); |
419 | $message .= " within these namespaces: $namespaces"; |
420 | } |
421 | if ( isset( $context['suggestion'] ) && strlen( $context['suggestion'] ) > 0 ) { |
422 | $message .= " and suggested '{suggestion}'"; |
423 | } |
424 | $message .= ". Requested via {source} for {identity} by executor {executor}"; |
425 | |
426 | return $message; |
427 | } |
428 | |
429 | /** |
430 | * Enforce all type constraints on the hits field and limit |
431 | * the number of results to the maximum specified. |
432 | * |
433 | * @param array[] $hits |
434 | * @return array[] |
435 | */ |
436 | private function encodeHits( array $hits ) { |
437 | $formattedHits = []; |
438 | foreach ( array_slice( $hits, 0, self::LOG_MAX_RESULTS ) as $hit ) { |
439 | $formattedHit = []; |
440 | Util::setIfDefined( $hit, 'title', $formattedHit, 'page_title', 'strval' ); |
441 | Util::setIfDefined( $hit, 'pageId', $formattedHit, 'page_id', 'intval' ); |
442 | Util::setIfDefined( $hit, 'index', $formattedHit, 'index', 'strval' ); |
443 | Util::setIfDefined( $hit, 'score', $formattedHit, 'score', 'floatval' ); |
444 | Util::setIfDefined( $hit, 'profileName', $formattedHit, 'profile_name', 'strval' ); |
445 | $formattedHits[] = $formattedHit; |
446 | } |
447 | return $formattedHits; |
448 | } |
449 | |
450 | /** |
451 | * Note that this is only accurate for hhvm and php-fpm |
452 | * since they close the request to the user before running |
453 | * deferred updates. |
454 | * |
455 | * @return int The number of ms the php request took |
456 | */ |
457 | private function getPhpRequestTookMs() { |
458 | $timing = RequestContext::getMain()->getTiming(); |
459 | $startMark = $timing->getEntryByName( 'requestStart' ); |
460 | $endMark = $timing->getEntryByName( 'requestShutdown' ); |
461 | if ( $startMark && $endMark ) { |
462 | // should always work, but Timing can return null so |
463 | // fallbacks are provided. |
464 | $tookS = $endMark['startTime'] - $startMark['startTime']; |
465 | } elseif ( isset( $_SERVER['REQUEST_TIME_FLOAT'] ) ) { |
466 | // php >= 5.4 |
467 | $tookS = microtime( true ) - $_SERVER['REQUEST_TIME_FLOAT']; |
468 | } else { |
469 | // php 5.3 |
470 | $tookS = microtime( true ) - $_SERVER['REQUEST_TIME']; |
471 | } |
472 | |
473 | return intval( 1000 * $tookS ); |
474 | } |
475 | } |