Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
81.72% |
152 / 186 |
|
41.67% |
5 / 12 |
CRAP | |
0.00% |
0 / 1 |
RequestLogger | |
81.72% |
152 / 186 |
|
41.67% |
5 / 12 |
71.81 | |
0.00% |
0 / 1 |
hasQueryLogs | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
sample | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
reportLogs | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
addRequest | |
78.95% |
15 / 19 |
|
0.00% |
0 / 1 |
6.34 | |||
appendLastLogPayload | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
getQueryTypesUsed | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
setResultPages | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
buildCirrusSearchRequestEvent | |
88.46% |
92 / 104 |
|
0.00% |
0 / 1 |
19.55 | |||
extractTitleStrings | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
buildLogMessage | |
81.25% |
13 / 16 |
|
0.00% |
0 / 1 |
8.42 | |||
encodeHits | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
getPhpRequestTookMs | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
4.18 |
1 | <?php |
2 | |
3 | namespace CirrusSearch; |
4 | |
5 | use ISearchResultSet; |
6 | use MediaWiki\Deferred\DeferredUpdates; |
7 | use MediaWiki\Logger\LoggerFactory; |
8 | use MediaWiki\User\User; |
9 | use MediaWiki\User\UserIdentity; |
10 | use MediaWiki\WikiMap\WikiMap; |
11 | use UIDGenerator; |
12 | |
13 | /** |
14 | * Handles logging information about requests made to various destinations, |
15 | * such as monolog, EventBus and statsd. |
16 | * |
17 | * This program is free software; you can redistribute it and/or modify |
18 | * it under the terms of the GNU General Public License as published by |
19 | * the Free Software Foundation; either version 2 of the License, or |
20 | * (at your option) any later version. |
21 | * |
22 | * This program is distributed in the hope that it will be useful, |
23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
25 | * GNU General Public License for more details. |
26 | * |
27 | * You should have received a copy of the GNU General Public License along |
28 | * with this program; if not, write to the Free Software Foundation, Inc., |
29 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
30 | * http://www.gnu.org/copyleft/gpl.html |
31 | */ |
32 | class RequestLogger { |
33 | /** |
34 | * @const int max number of results to store in cirrussearch-request logs (per request) |
35 | */ |
36 | private const LOG_MAX_RESULTS = 50; |
37 | |
38 | // If a title hit was given to the user, but the hit was not |
39 | // obtained via ElasticSearch, we won't really know where it came from. |
40 | // We still want to log the fact that the request set generated a hit |
41 | // for this title. When this happens, the hit index field value will be this. |
42 | private const UNKNOWN_HIT_INDEX = '_UNKNOWN_'; |
43 | |
44 | /** |
45 | * @var RequestLog[] Set of requests made |
46 | */ |
47 | private $logs = []; |
48 | |
49 | /** |
50 | * @var string[] Result page ids that were returned to user |
51 | */ |
52 | private $resultTitleStrings = []; |
53 | |
54 | /** |
55 | * @var string[][] Extra payload for the logs, indexed first by the log index |
56 | * in self::$logs, and second by the payload item name. |
57 | */ |
58 | private $extraPayload = []; |
59 | |
60 | /** |
61 | * @return bool True when logs have been generated during the current |
62 | * php execution. |
63 | */ |
64 | public function hasQueryLogs() { |
65 | return (bool)$this->logs; |
66 | } |
67 | |
68 | /** |
69 | * @param float $percentage A value between 0 and 1 indiciating the |
70 | * percentage of calls that should return true. |
71 | * @return bool True on $percentage calls to this method |
72 | */ |
73 | private function sample( $percentage ) { |
74 | if ( $percentage <= 0 ) { |
75 | return false; |
76 | } |
77 | if ( $percentage >= 1 ) { |
78 | return true; |
79 | } |
80 | $rand = random_int( 0, PHP_INT_MAX ) / PHP_INT_MAX; |
81 | return $rand <= $percentage; |
82 | } |
83 | |
84 | /** |
85 | * Summarizes all the requests made in this process and reports |
86 | * them along with the test they belong to. |
87 | */ |
88 | private function reportLogs() { |
89 | if ( $this->logs ) { |
90 | |
91 | // Build the mediawiki/search/requestset event and log it to the (json+EventBus) |
92 | // cirrussearch-request channel. |
93 | LoggerFactory::getInstance( 'cirrussearch-request' )->debug( |
94 | '', $this->buildCirrusSearchRequestEvent() |
95 | ); |
96 | |
97 | // reset logs |
98 | $this->logs = []; |
99 | } |
100 | } |
101 | |
102 | /** |
103 | * @param RequestLog $log The log about a network request to be added |
104 | * @param User|null $user The user performing the request, or null |
105 | * for actions that don't have a user (such as index updates). |
106 | * @param int|null $slowMillis The threshold in ms after which the request |
107 | * will be considered slow. |
108 | * @return array A map of information about the performed request, suitible |
109 | * for use as a psr-3 log context. |
110 | */ |
111 | public function addRequest( RequestLog $log, UserIdentity $user = null, $slowMillis = null ) { |
112 | global $wgCirrusSearchLogElasticRequests; |
113 | |
114 | // @todo Is this necessary here? Check on what uses the response value |
115 | $finalContext = $log->getLogVariables() + [ |
116 | 'source' => Util::getExecutionContext(), |
117 | 'executor' => Util::getExecutionId(), |
118 | 'identity' => Util::generateIdentToken(), |
119 | ]; |
120 | if ( $wgCirrusSearchLogElasticRequests ) { |
121 | $this->logs[] = $log; |
122 | if ( count( $this->logs ) === 1 ) { |
123 | DeferredUpdates::addCallableUpdate( function () { |
124 | $this->reportLogs(); |
125 | } ); |
126 | } |
127 | |
128 | $logMessage = $this->buildLogMessage( $log, $finalContext ); |
129 | LoggerFactory::getInstance( 'CirrusSearchRequests' )->debug( $logMessage, $finalContext ); |
130 | if ( $slowMillis && $log->getTookMs() >= $slowMillis ) { |
131 | if ( $user !== null ) { |
132 | $finalContext['user'] = $user->getName(); |
133 | $logMessage .= ' for {user}'; |
134 | } |
135 | LoggerFactory::getInstance( 'CirrusSearchSlowRequests' )->info( $logMessage, $finalContext ); |
136 | } |
137 | } |
138 | |
139 | return $finalContext; |
140 | } |
141 | |
142 | /** |
143 | * @param string $key |
144 | * @param string $value |
145 | */ |
146 | public function appendLastLogPayload( $key, $value ) { |
147 | $idx = count( $this->logs ) - 1; |
148 | if ( isset( $this->logs[$idx] ) ) { |
149 | $this->extraPayload[$idx][$key] = $value; |
150 | } |
151 | } |
152 | |
153 | /** |
154 | * Report the types of queries that were issued |
155 | * within the current request. |
156 | * |
157 | * @return string[] |
158 | */ |
159 | public function getQueryTypesUsed() { |
160 | $types = []; |
161 | foreach ( $this->logs as $log ) { |
162 | $types[] = $log->getQueryType(); |
163 | } |
164 | return array_unique( $types ); |
165 | } |
166 | |
167 | /** |
168 | * This is set externally because we don't have complete control, from the |
169 | * SearchEngine interface, of what is actually sent to the user. Instead hooks |
170 | * receive the final results that will be sent to the user and set them here. |
171 | * |
172 | * Accepts two result sets because some places (Special:Search) perform multiple |
173 | * searches. This can be called multiple times, but only that last call wins. For |
174 | * API's that is correct, for Special:Search a hook catches the final results and |
175 | * sets them here. |
176 | * |
177 | * @param ISearchResultSet[] $matches |
178 | */ |
179 | public function setResultPages( array $matches ) { |
180 | $titleStrings = []; |
181 | foreach ( $matches as $resultSet ) { |
182 | if ( $resultSet !== null ) { |
183 | $titleStrings = array_merge( $titleStrings, $this->extractTitleStrings( $resultSet ) ); |
184 | } |
185 | } |
186 | $this->resultTitleStrings = $titleStrings; |
187 | } |
188 | |
189 | /** |
190 | * Builds and ships a log context of an object that conforms to the |
191 | * JSONSchema mediawiki/search/requestset event schema. |
192 | * |
193 | * @return array mediawiki/search/requestset event object |
194 | */ |
195 | private function buildCirrusSearchRequestEvent() { |
196 | global $wgRequest, $wgServerName; |
197 | $webrequest = $wgRequest; |
198 | |
199 | // for the moment RequestLog::getRequests() is still created in the |
200 | // old format to serve the old log formats, so here we transform the |
201 | // context into the new mediawiki/cirrussearch/request event format. |
202 | // At some point the context should just be created in the correct format. |
203 | $elasticSearchRequests = []; |
204 | |
205 | // If false, then at least one of the Elasticsearch request responses |
206 | // returned was not from cache |
207 | $allRequestsCached = true; |
208 | |
209 | // While building the elasticSearchRequests object, collect |
210 | // a list of all hits from all Elasticsearch requests. |
211 | $allHits = []; |
212 | |
213 | // Iterate over each individual Elasticsearch request |
214 | foreach ( $this->logs as $idx => $elasticSearchRequestLog ) { |
215 | foreach ( $elasticSearchRequestLog->getRequests() as $requestContext ) { |
216 | // Build an entry in the elasticSearchRequests array from each logged |
217 | // Elasticsearch request. |
218 | $requestEntry = []; |
219 | |
220 | Util::setIfDefined( $requestContext, 'query', $requestEntry, 'query', 'strval' ); |
221 | Util::setIfDefined( $requestContext, 'queryType', $requestEntry, 'query_type', 'strval' ); |
222 | Util::setIfDefined( |
223 | $requestContext, 'index', $requestEntry, 'indices', |
224 | // Use list for search indices, not csv string. |
225 | static function ( $v ) { |
226 | return explode( ',', $v ); |
227 | } |
228 | ); |
229 | Util::setIfDefined( |
230 | $requestContext, 'namespaces', $requestEntry, 'namespaces', |
231 | // Make sure namespace values are all integers. |
232 | static function ( $v ) { |
233 | if ( !$v ) { |
234 | return $v; |
235 | } else { |
236 | return array_values( array_map( 'intval', $v ) ); |
237 | } |
238 | } |
239 | ); |
240 | Util::setIfDefined( $requestContext, 'tookMs', $requestEntry, 'request_time_ms', 'intval' ); |
241 | Util::setIfDefined( $requestContext, 'elasticTookMs', $requestEntry, 'search_time_ms', 'intval' ); |
242 | Util::setIfDefined( $requestContext, 'limit', $requestEntry, 'limit', 'intval' ); |
243 | Util::setIfDefined( $requestContext, 'hitsTotal', $requestEntry, 'hits_total', 'intval' ); |
244 | Util::setIfDefined( $requestContext, 'hitsReturned', $requestEntry, 'hits_returned', 'intval' ); |
245 | Util::setIfDefined( $requestContext, 'hitsOffset', $requestEntry, 'hits_offset', 'intval' ); |
246 | Util::setIfDefined( $requestContext, 'suggestion', $requestEntry, 'suggestion', 'strval' ); |
247 | |
248 | // suggestion_requested is true if suggestionRequested or if any suggestion is present. |
249 | $requestEntry['suggestion_requested'] = |
250 | (bool)( $requestContext['suggestionRequested'] ?? isset( $requestContext['suggestion'] ) ); |
251 | |
252 | Util::setIfDefined( $requestContext, 'maxScore', $requestEntry, 'max_score', 'floatval' ); |
253 | |
254 | if ( isset( $this->extraPayload[$idx] ) ) { |
255 | // fields in extraPayload keys must be explicitly |
256 | // set here if we want to include them in the event. |
257 | Util::setIfDefined( |
258 | $this->extraPayload[$idx], 'langdetect', $requestEntry, 'langdetect', 'strval' |
259 | ); |
260 | } |
261 | |
262 | Util::setIfDefined( $requestContext, 'syntax', $requestEntry, 'syntax' ); |
263 | |
264 | // If response was servied from cache. |
265 | $requestEntry['cached'] = (bool)$elasticSearchRequestLog->isCachedResponse(); |
266 | // $wereAllResponsesCached will be used later for |
267 | // deteriming if all responses for the request set were cached. |
268 | if ( !$requestEntry['cached'] ) { |
269 | $allRequestsCached = false; |
270 | } |
271 | |
272 | Util::setIfDefined( |
273 | $requestContext, 'hits', $requestEntry, 'hits', |
274 | function ( $v ) { |
275 | return $this->encodeHits( $v ); |
276 | } |
277 | ); |
278 | if ( isset( $requestEntry['hits'] ) ) { |
279 | $allHits = array_merge( $allHits, $requestEntry['hits'] ); |
280 | } |
281 | |
282 | $elasticSearchRequests[] = $requestEntry; |
283 | } |
284 | } |
285 | |
286 | // Reindex allHits by page titles. It's maybe not perfect, but it's |
287 | // hopefully a "close enough" representation of where our final result |
288 | // set came from. maybe :( |
289 | $allHitsByTitle = []; |
290 | foreach ( $allHits as $hit ) { |
291 | '@phan-var array $hit'; |
292 | $allHitsByTitle[$hit['page_title']] = $hit; |
293 | } |
294 | $resultHits = []; |
295 | // $this->resultTitleStrings give us title hits that were actually provided to the user. |
296 | // Build a subset top level list of hits from each sub request's list of hits. |
297 | $resultTitleStrings = array_slice( $this->resultTitleStrings, 0, self::LOG_MAX_RESULTS ); |
298 | foreach ( $resultTitleStrings as $titleString ) { |
299 | // $allHitsByTitle[$titleString] will have the hit. If there isn't a hit |
300 | // for this title, then this title 'hit' did not come from a ElasticSearch request, |
301 | // so log a 'fake' hit for it. In this case, the index will be self::UNKNOWN_HIT_INDEX. |
302 | $hit = $allHitsByTitle[$titleString] ?? [ |
303 | 'page_title' => (string)$titleString, |
304 | 'index' => self::UNKNOWN_HIT_INDEX, |
305 | ]; |
306 | $resultHits[] = $hit; |
307 | } |
308 | |
309 | $requestEvent = [ |
310 | // This schema can be found in the mediawiki/event-schemas repository. |
311 | // The $schema URI here should be updated if we increment schema versions. |
312 | '$schema' => '/mediawiki/cirrussearch/request/0.0.1', |
313 | 'meta' => [ |
314 | 'request_id' => $webrequest->getRequestId(), |
315 | 'id' => UIDGenerator::newUUIDv4(), |
316 | 'dt' => wfTimestamp( TS_ISO_8601 ), |
317 | 'domain' => $wgServerName, |
318 | 'stream' => 'mediawiki.cirrussearch-request', |
319 | ], |
320 | 'http' => [ |
321 | 'method' => $webrequest->getMethod(), |
322 | 'client_ip' => $webrequest->getIP(), |
323 | 'has_cookies' => $webrequest->getHeader( 'Cookie' ) ? true : false, |
324 | ], |
325 | 'database' => WikiMap::getCurrentWikiId(), |
326 | 'mediawiki_host' => gethostname(), |
327 | 'search_id' => Util::getRequestSetToken(), |
328 | 'source' => Util::getExecutionContext(), |
329 | 'identity' => Util::generateIdentToken(), |
330 | 'request_time_ms' => $this->getPhpRequestTookMs(), |
331 | 'all_elasticsearch_requests_cached' => $allRequestsCached, |
332 | ]; |
333 | |
334 | $webRequestValues = $webrequest->getValues(); |
335 | if ( $webRequestValues ) { |
336 | $requestEvent['params'] = []; |
337 | // Make sure all params are string keys and values |
338 | foreach ( $webRequestValues as $k => $v ) { |
339 | if ( !is_scalar( $v ) ) { |
340 | // This is potentially a multi-dimensional array. JSON is |
341 | // perhaps not the best format, but this gives a good |
342 | // guarantee about always returning a string, and |
343 | // faithfully represents the variety of shapes request |
344 | // parameters can be parsed into. |
345 | $v = \FormatJson::encode( $v ); |
346 | } |
347 | $k = $webrequest->normalizeUnicode( $k ); |
348 | $requestEvent['params'][(string)$k] = (string)$v; |
349 | } |
350 | } |
351 | |
352 | // Don't set these fields if there is no data. |
353 | if ( $resultHits ) { |
354 | $requestEvent['hits'] = $resultHits; |
355 | } |
356 | if ( $elasticSearchRequests ) { |
357 | $requestEvent['elasticsearch_requests'] = $elasticSearchRequests; |
358 | } |
359 | // Should always be true, but don't accidently instantiate user testing if somehow |
360 | // it wasn't done previously. |
361 | if ( UserTestingStatus::hasInstance() ) { |
362 | $ut = UserTestingStatus::getInstance(); |
363 | if ( $ut->isActive() ) { |
364 | // Wrap into an array. UserTesting does not support multiple |
365 | // concurrent tests in one request, but the schema was written |
366 | // when that was a possibility. |
367 | $requestEvent['backend_user_tests'] = [ $ut->getTrigger() ]; |
368 | } |
369 | } |
370 | |
371 | // If in webrequests, log these request headers in http.headers. |
372 | $httpRequestHeadersToLog = [ 'accept-language', 'referer', 'user-agent' ]; |
373 | foreach ( $httpRequestHeadersToLog as $header ) { |
374 | if ( $webrequest->getHeader( $header ) ) { |
375 | $requestEvent['http']['request_headers'][$header] = |
376 | (string)$webrequest->normalizeUnicode( $webrequest->getHeader( $header ) ); |
377 | } |
378 | } |
379 | |
380 | return $requestEvent; |
381 | } |
382 | |
383 | /** |
384 | * @param ISearchResultSet $matches |
385 | * @return string[] |
386 | */ |
387 | private function extractTitleStrings( ISearchResultSet $matches ) { |
388 | $strings = []; |
389 | foreach ( $matches as $result ) { |
390 | $strings[] = (string)$result->getTitle(); |
391 | } |
392 | |
393 | return $strings; |
394 | } |
395 | |
396 | /** |
397 | * @param RequestLog $log The request to build a log message about |
398 | * @param array $context Request specific log variables from RequestLog::getLogVariables() |
399 | * @return string a PSR-3 compliant message describing $context |
400 | */ |
401 | private function buildLogMessage( RequestLog $log, array $context ) { |
402 | $message = $log->getDescription(); |
403 | $message .= " against {index} took {tookMs} millis"; |
404 | if ( isset( $context['elasticTookMs'] ) ) { |
405 | $message .= " and {elasticTookMs} Elasticsearch millis"; |
406 | if ( isset( $context['elasticTook2PassMs'] ) && $context['elasticTook2PassMs'] >= 0 ) { |
407 | $message .= " (with 2nd pass: {elasticTook2PassMs} ms)"; |
408 | } |
409 | } |
410 | if ( isset( $context['hitsTotal'] ) ) { |
411 | $message .= ". Found {hitsTotal} total results"; |
412 | $message .= " and returned {hitsReturned} of them starting at {hitsOffset}"; |
413 | } |
414 | if ( isset( $context['namespaces'] ) ) { |
415 | $namespaces = implode( ', ', $context['namespaces'] ); |
416 | $message .= " within these namespaces: $namespaces"; |
417 | } |
418 | if ( isset( $context['suggestion'] ) && strlen( $context['suggestion'] ) > 0 ) { |
419 | $message .= " and suggested '{suggestion}'"; |
420 | } |
421 | $message .= ". Requested via {source} for {identity} by executor {executor}"; |
422 | |
423 | return $message; |
424 | } |
425 | |
426 | /** |
427 | * Enforce all type constraints on the hits field and limit |
428 | * the number of results to the maximum specified. |
429 | * |
430 | * @param array[] $hits |
431 | * @return array[] |
432 | */ |
433 | private function encodeHits( array $hits ) { |
434 | $formattedHits = []; |
435 | foreach ( array_slice( $hits, 0, self::LOG_MAX_RESULTS ) as $hit ) { |
436 | $formattedHit = []; |
437 | Util::setIfDefined( $hit, 'title', $formattedHit, 'page_title', 'strval' ); |
438 | Util::setIfDefined( $hit, 'pageId', $formattedHit, 'page_id', 'intval' ); |
439 | Util::setIfDefined( $hit, 'index', $formattedHit, 'index', 'strval' ); |
440 | Util::setIfDefined( $hit, 'score', $formattedHit, 'score', 'floatval' ); |
441 | Util::setIfDefined( $hit, 'profileName', $formattedHit, 'profile_name', 'strval' ); |
442 | $formattedHits[] = $formattedHit; |
443 | } |
444 | return $formattedHits; |
445 | } |
446 | |
447 | /** |
448 | * Note that this is only accurate for hhvm and php-fpm |
449 | * since they close the request to the user before running |
450 | * deferred updates. |
451 | * |
452 | * @return int The number of ms the php request took |
453 | */ |
454 | private function getPhpRequestTookMs() { |
455 | $timing = \RequestContext::getMain()->getTiming(); |
456 | $startMark = $timing->getEntryByName( 'requestStart' ); |
457 | $endMark = $timing->getEntryByName( 'requestShutdown' ); |
458 | if ( $startMark && $endMark ) { |
459 | // should always work, but Timing can return null so |
460 | // fallbacks are provided. |
461 | $tookS = $endMark['startTime'] - $startMark['startTime']; |
462 | } elseif ( isset( $_SERVER['REQUEST_TIME_FLOAT'] ) ) { |
463 | // php >= 5.4 |
464 | $tookS = microtime( true ) - $_SERVER['REQUEST_TIME_FLOAT']; |
465 | } else { |
466 | // php 5.3 |
467 | $tookS = microtime( true ) - $_SERVER['REQUEST_TIME']; |
468 | } |
469 | |
470 | return intval( 1000 * $tookS ); |
471 | } |
472 | } |