Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
81.72% |
152 / 186 |
|
41.67% |
5 / 12 |
CRAP | |
0.00% |
0 / 1 |
RequestLogger | |
81.72% |
152 / 186 |
|
41.67% |
5 / 12 |
71.81 | |
0.00% |
0 / 1 |
hasQueryLogs | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
sample | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
reportLogs | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
addRequest | |
78.95% |
15 / 19 |
|
0.00% |
0 / 1 |
6.34 | |||
appendLastLogPayload | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
getQueryTypesUsed | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
setResultPages | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
buildCirrusSearchRequestEvent | |
88.46% |
92 / 104 |
|
0.00% |
0 / 1 |
19.55 | |||
extractTitleStrings | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
buildLogMessage | |
81.25% |
13 / 16 |
|
0.00% |
0 / 1 |
8.42 | |||
encodeHits | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
getPhpRequestTookMs | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
4.18 |
1 | <?php |
2 | |
3 | namespace CirrusSearch; |
4 | |
5 | use ISearchResultSet; |
6 | use MediaWiki\Context\RequestContext; |
7 | use MediaWiki\Deferred\DeferredUpdates; |
8 | use MediaWiki\Logger\LoggerFactory; |
9 | use MediaWiki\User\User; |
10 | use MediaWiki\User\UserIdentity; |
11 | use MediaWiki\WikiMap\WikiMap; |
12 | use UIDGenerator; |
13 | |
14 | /** |
15 | * Handles logging information about requests made to various destinations, |
16 | * such as monolog, EventBus and statsd. |
17 | * |
18 | * This program is free software; you can redistribute it and/or modify |
19 | * it under the terms of the GNU General Public License as published by |
20 | * the Free Software Foundation; either version 2 of the License, or |
21 | * (at your option) any later version. |
22 | * |
23 | * This program is distributed in the hope that it will be useful, |
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26 | * GNU General Public License for more details. |
27 | * |
28 | * You should have received a copy of the GNU General Public License along |
29 | * with this program; if not, write to the Free Software Foundation, Inc., |
30 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
31 | * http://www.gnu.org/copyleft/gpl.html |
32 | */ |
33 | class RequestLogger { |
34 | /** |
35 | * @const int max number of results to store in cirrussearch-request logs (per request) |
36 | */ |
37 | private const LOG_MAX_RESULTS = 50; |
38 | |
39 | // If a title hit was given to the user, but the hit was not |
40 | // obtained via ElasticSearch, we won't really know where it came from. |
41 | // We still want to log the fact that the request set generated a hit |
42 | // for this title. When this happens, the hit index field value will be this. |
43 | private const UNKNOWN_HIT_INDEX = '_UNKNOWN_'; |
44 | |
45 | /** |
46 | * @var RequestLog[] Set of requests made |
47 | */ |
48 | private $logs = []; |
49 | |
50 | /** |
51 | * @var string[] Result page ids that were returned to user |
52 | */ |
53 | private $resultTitleStrings = []; |
54 | |
55 | /** |
56 | * @var string[][] Extra payload for the logs, indexed first by the log index |
57 | * in self::$logs, and second by the payload item name. |
58 | */ |
59 | private $extraPayload = []; |
60 | |
61 | /** |
62 | * @return bool True when logs have been generated during the current |
63 | * php execution. |
64 | */ |
65 | public function hasQueryLogs() { |
66 | return (bool)$this->logs; |
67 | } |
68 | |
69 | /** |
70 | * @param float $percentage A value between 0 and 1 indiciating the |
71 | * percentage of calls that should return true. |
72 | * @return bool True on $percentage calls to this method |
73 | */ |
74 | private function sample( $percentage ) { |
75 | if ( $percentage <= 0 ) { |
76 | return false; |
77 | } |
78 | if ( $percentage >= 1 ) { |
79 | return true; |
80 | } |
81 | $rand = random_int( 0, PHP_INT_MAX ) / PHP_INT_MAX; |
82 | return $rand <= $percentage; |
83 | } |
84 | |
85 | /** |
86 | * Summarizes all the requests made in this process and reports |
87 | * them along with the test they belong to. |
88 | */ |
89 | private function reportLogs() { |
90 | if ( $this->logs ) { |
91 | |
92 | // Build the mediawiki/search/requestset event and log it to the (json+EventBus) |
93 | // cirrussearch-request channel. |
94 | LoggerFactory::getInstance( 'cirrussearch-request' )->debug( |
95 | '', $this->buildCirrusSearchRequestEvent() |
96 | ); |
97 | |
98 | // reset logs |
99 | $this->logs = []; |
100 | } |
101 | } |
102 | |
103 | /** |
104 | * @param RequestLog $log The log about a network request to be added |
105 | * @param User|null $user The user performing the request, or null |
106 | * for actions that don't have a user (such as index updates). |
107 | * @param int|null $slowMillis The threshold in ms after which the request |
108 | * will be considered slow. |
109 | * @return array A map of information about the performed request, suitible |
110 | * for use as a psr-3 log context. |
111 | */ |
112 | public function addRequest( RequestLog $log, UserIdentity $user = null, $slowMillis = null ) { |
113 | global $wgCirrusSearchLogElasticRequests; |
114 | |
115 | // @todo Is this necessary here? Check on what uses the response value |
116 | $finalContext = $log->getLogVariables() + [ |
117 | 'source' => Util::getExecutionContext(), |
118 | 'executor' => Util::getExecutionId(), |
119 | 'identity' => Util::generateIdentToken(), |
120 | ]; |
121 | if ( $wgCirrusSearchLogElasticRequests ) { |
122 | $this->logs[] = $log; |
123 | if ( count( $this->logs ) === 1 ) { |
124 | DeferredUpdates::addCallableUpdate( function () { |
125 | $this->reportLogs(); |
126 | } ); |
127 | } |
128 | |
129 | $logMessage = $this->buildLogMessage( $log, $finalContext ); |
130 | LoggerFactory::getInstance( 'CirrusSearchRequests' )->debug( $logMessage, $finalContext ); |
131 | if ( $slowMillis && $log->getTookMs() >= $slowMillis ) { |
132 | if ( $user !== null ) { |
133 | $finalContext['user'] = $user->getName(); |
134 | $logMessage .= ' for {user}'; |
135 | } |
136 | LoggerFactory::getInstance( 'CirrusSearchSlowRequests' )->info( $logMessage, $finalContext ); |
137 | } |
138 | } |
139 | |
140 | return $finalContext; |
141 | } |
142 | |
143 | /** |
144 | * @param string $key |
145 | * @param string $value |
146 | */ |
147 | public function appendLastLogPayload( $key, $value ) { |
148 | $idx = count( $this->logs ) - 1; |
149 | if ( isset( $this->logs[$idx] ) ) { |
150 | $this->extraPayload[$idx][$key] = $value; |
151 | } |
152 | } |
153 | |
154 | /** |
155 | * Report the types of queries that were issued |
156 | * within the current request. |
157 | * |
158 | * @return string[] |
159 | */ |
160 | public function getQueryTypesUsed() { |
161 | $types = []; |
162 | foreach ( $this->logs as $log ) { |
163 | $types[] = $log->getQueryType(); |
164 | } |
165 | return array_unique( $types ); |
166 | } |
167 | |
168 | /** |
169 | * This is set externally because we don't have complete control, from the |
170 | * SearchEngine interface, of what is actually sent to the user. Instead hooks |
171 | * receive the final results that will be sent to the user and set them here. |
172 | * |
173 | * Accepts two result sets because some places (Special:Search) perform multiple |
174 | * searches. This can be called multiple times, but only that last call wins. For |
175 | * API's that is correct, for Special:Search a hook catches the final results and |
176 | * sets them here. |
177 | * |
178 | * @param ISearchResultSet[] $matches |
179 | */ |
180 | public function setResultPages( array $matches ) { |
181 | $titleStrings = []; |
182 | foreach ( $matches as $resultSet ) { |
183 | if ( $resultSet !== null ) { |
184 | $titleStrings = array_merge( $titleStrings, $this->extractTitleStrings( $resultSet ) ); |
185 | } |
186 | } |
187 | $this->resultTitleStrings = $titleStrings; |
188 | } |
189 | |
190 | /** |
191 | * Builds and ships a log context of an object that conforms to the |
192 | * JSONSchema mediawiki/search/requestset event schema. |
193 | * |
194 | * @return array mediawiki/search/requestset event object |
195 | */ |
196 | private function buildCirrusSearchRequestEvent() { |
197 | global $wgRequest, $wgServerName; |
198 | $webrequest = $wgRequest; |
199 | |
200 | // for the moment RequestLog::getRequests() is still created in the |
201 | // old format to serve the old log formats, so here we transform the |
202 | // context into the new mediawiki/cirrussearch/request event format. |
203 | // At some point the context should just be created in the correct format. |
204 | $elasticSearchRequests = []; |
205 | |
206 | // If false, then at least one of the Elasticsearch request responses |
207 | // returned was not from cache |
208 | $allRequestsCached = true; |
209 | |
210 | // While building the elasticSearchRequests object, collect |
211 | // a list of all hits from all Elasticsearch requests. |
212 | $allHits = []; |
213 | |
214 | // Iterate over each individual Elasticsearch request |
215 | foreach ( $this->logs as $idx => $elasticSearchRequestLog ) { |
216 | foreach ( $elasticSearchRequestLog->getRequests() as $requestContext ) { |
217 | // Build an entry in the elasticSearchRequests array from each logged |
218 | // Elasticsearch request. |
219 | $requestEntry = []; |
220 | |
221 | Util::setIfDefined( $requestContext, 'query', $requestEntry, 'query', 'strval' ); |
222 | Util::setIfDefined( $requestContext, 'queryType', $requestEntry, 'query_type', 'strval' ); |
223 | Util::setIfDefined( |
224 | $requestContext, 'index', $requestEntry, 'indices', |
225 | // Use list for search indices, not csv string. |
226 | static function ( $v ) { |
227 | return explode( ',', $v ); |
228 | } |
229 | ); |
230 | Util::setIfDefined( |
231 | $requestContext, 'namespaces', $requestEntry, 'namespaces', |
232 | // Make sure namespace values are all integers. |
233 | static function ( $v ) { |
234 | if ( !$v ) { |
235 | return $v; |
236 | } else { |
237 | return array_values( array_map( 'intval', $v ) ); |
238 | } |
239 | } |
240 | ); |
241 | Util::setIfDefined( $requestContext, 'tookMs', $requestEntry, 'request_time_ms', 'intval' ); |
242 | Util::setIfDefined( $requestContext, 'elasticTookMs', $requestEntry, 'search_time_ms', 'intval' ); |
243 | Util::setIfDefined( $requestContext, 'limit', $requestEntry, 'limit', 'intval' ); |
244 | Util::setIfDefined( $requestContext, 'hitsTotal', $requestEntry, 'hits_total', 'intval' ); |
245 | Util::setIfDefined( $requestContext, 'hitsReturned', $requestEntry, 'hits_returned', 'intval' ); |
246 | Util::setIfDefined( $requestContext, 'hitsOffset', $requestEntry, 'hits_offset', 'intval' ); |
247 | Util::setIfDefined( $requestContext, 'suggestion', $requestEntry, 'suggestion', 'strval' ); |
248 | |
249 | // suggestion_requested is true if suggestionRequested or if any suggestion is present. |
250 | $requestEntry['suggestion_requested'] = |
251 | (bool)( $requestContext['suggestionRequested'] ?? isset( $requestContext['suggestion'] ) ); |
252 | |
253 | Util::setIfDefined( $requestContext, 'maxScore', $requestEntry, 'max_score', 'floatval' ); |
254 | |
255 | if ( isset( $this->extraPayload[$idx] ) ) { |
256 | // fields in extraPayload keys must be explicitly |
257 | // set here if we want to include them in the event. |
258 | Util::setIfDefined( |
259 | $this->extraPayload[$idx], 'langdetect', $requestEntry, 'langdetect', 'strval' |
260 | ); |
261 | } |
262 | |
263 | Util::setIfDefined( $requestContext, 'syntax', $requestEntry, 'syntax' ); |
264 | |
265 | // If response was servied from cache. |
266 | $requestEntry['cached'] = (bool)$elasticSearchRequestLog->isCachedResponse(); |
267 | // $wereAllResponsesCached will be used later for |
268 | // deteriming if all responses for the request set were cached. |
269 | if ( !$requestEntry['cached'] ) { |
270 | $allRequestsCached = false; |
271 | } |
272 | |
273 | Util::setIfDefined( |
274 | $requestContext, 'hits', $requestEntry, 'hits', |
275 | function ( $v ) { |
276 | return $this->encodeHits( $v ); |
277 | } |
278 | ); |
279 | if ( isset( $requestEntry['hits'] ) ) { |
280 | $allHits = array_merge( $allHits, $requestEntry['hits'] ); |
281 | } |
282 | |
283 | $elasticSearchRequests[] = $requestEntry; |
284 | } |
285 | } |
286 | |
287 | // Reindex allHits by page titles. It's maybe not perfect, but it's |
288 | // hopefully a "close enough" representation of where our final result |
289 | // set came from. maybe :( |
290 | $allHitsByTitle = []; |
291 | foreach ( $allHits as $hit ) { |
292 | '@phan-var array $hit'; |
293 | $allHitsByTitle[$hit['page_title']] = $hit; |
294 | } |
295 | $resultHits = []; |
296 | // $this->resultTitleStrings give us title hits that were actually provided to the user. |
297 | // Build a subset top level list of hits from each sub request's list of hits. |
298 | $resultTitleStrings = array_slice( $this->resultTitleStrings, 0, self::LOG_MAX_RESULTS ); |
299 | foreach ( $resultTitleStrings as $titleString ) { |
300 | // $allHitsByTitle[$titleString] will have the hit. If there isn't a hit |
301 | // for this title, then this title 'hit' did not come from a ElasticSearch request, |
302 | // so log a 'fake' hit for it. In this case, the index will be self::UNKNOWN_HIT_INDEX. |
303 | $hit = $allHitsByTitle[$titleString] ?? [ |
304 | 'page_title' => (string)$titleString, |
305 | 'index' => self::UNKNOWN_HIT_INDEX, |
306 | ]; |
307 | $resultHits[] = $hit; |
308 | } |
309 | |
310 | $requestEvent = [ |
311 | // This schema can be found in the mediawiki/event-schemas repository. |
312 | // The $schema URI here should be updated if we increment schema versions. |
313 | '$schema' => '/mediawiki/cirrussearch/request/0.0.1', |
314 | 'meta' => [ |
315 | 'request_id' => $webrequest->getRequestId(), |
316 | 'id' => UIDGenerator::newUUIDv4(), |
317 | 'dt' => wfTimestamp( TS_ISO_8601 ), |
318 | 'domain' => $wgServerName, |
319 | 'stream' => 'mediawiki.cirrussearch-request', |
320 | ], |
321 | 'http' => [ |
322 | 'method' => $webrequest->getMethod(), |
323 | 'client_ip' => $webrequest->getIP(), |
324 | 'has_cookies' => $webrequest->getHeader( 'Cookie' ) ? true : false, |
325 | ], |
326 | 'database' => WikiMap::getCurrentWikiId(), |
327 | 'mediawiki_host' => gethostname(), |
328 | 'search_id' => Util::getRequestSetToken(), |
329 | 'source' => Util::getExecutionContext(), |
330 | 'identity' => Util::generateIdentToken(), |
331 | 'request_time_ms' => $this->getPhpRequestTookMs(), |
332 | 'all_elasticsearch_requests_cached' => $allRequestsCached, |
333 | ]; |
334 | |
335 | $webRequestValues = $webrequest->getValues(); |
336 | if ( $webRequestValues ) { |
337 | $requestEvent['params'] = []; |
338 | // Make sure all params are string keys and values |
339 | foreach ( $webRequestValues as $k => $v ) { |
340 | if ( !is_scalar( $v ) ) { |
341 | // This is potentially a multi-dimensional array. JSON is |
342 | // perhaps not the best format, but this gives a good |
343 | // guarantee about always returning a string, and |
344 | // faithfully represents the variety of shapes request |
345 | // parameters can be parsed into. |
346 | $v = \FormatJson::encode( $v ); |
347 | } |
348 | $k = $webrequest->normalizeUnicode( $k ); |
349 | $requestEvent['params'][(string)$k] = (string)$v; |
350 | } |
351 | } |
352 | |
353 | // Don't set these fields if there is no data. |
354 | if ( $resultHits ) { |
355 | $requestEvent['hits'] = $resultHits; |
356 | } |
357 | if ( $elasticSearchRequests ) { |
358 | $requestEvent['elasticsearch_requests'] = $elasticSearchRequests; |
359 | } |
360 | // Should always be true, but don't accidently instantiate user testing if somehow |
361 | // it wasn't done previously. |
362 | if ( UserTestingStatus::hasInstance() ) { |
363 | $ut = UserTestingStatus::getInstance(); |
364 | if ( $ut->isActive() ) { |
365 | // Wrap into an array. UserTesting does not support multiple |
366 | // concurrent tests in one request, but the schema was written |
367 | // when that was a possibility. |
368 | $requestEvent['backend_user_tests'] = [ $ut->getTrigger() ]; |
369 | } |
370 | } |
371 | |
372 | // If in webrequests, log these request headers in http.headers. |
373 | $httpRequestHeadersToLog = [ 'accept-language', 'referer', 'user-agent' ]; |
374 | foreach ( $httpRequestHeadersToLog as $header ) { |
375 | if ( $webrequest->getHeader( $header ) ) { |
376 | $requestEvent['http']['request_headers'][$header] = |
377 | (string)$webrequest->normalizeUnicode( $webrequest->getHeader( $header ) ); |
378 | } |
379 | } |
380 | |
381 | return $requestEvent; |
382 | } |
383 | |
384 | /** |
385 | * @param ISearchResultSet $matches |
386 | * @return string[] |
387 | */ |
388 | private function extractTitleStrings( ISearchResultSet $matches ) { |
389 | $strings = []; |
390 | foreach ( $matches as $result ) { |
391 | $strings[] = (string)$result->getTitle(); |
392 | } |
393 | |
394 | return $strings; |
395 | } |
396 | |
397 | /** |
398 | * @param RequestLog $log The request to build a log message about |
399 | * @param array $context Request specific log variables from RequestLog::getLogVariables() |
400 | * @return string a PSR-3 compliant message describing $context |
401 | */ |
402 | private function buildLogMessage( RequestLog $log, array $context ) { |
403 | $message = $log->getDescription(); |
404 | $message .= " against {index} took {tookMs} millis"; |
405 | if ( isset( $context['elasticTookMs'] ) ) { |
406 | $message .= " and {elasticTookMs} Elasticsearch millis"; |
407 | if ( isset( $context['elasticTook2PassMs'] ) && $context['elasticTook2PassMs'] >= 0 ) { |
408 | $message .= " (with 2nd pass: {elasticTook2PassMs} ms)"; |
409 | } |
410 | } |
411 | if ( isset( $context['hitsTotal'] ) ) { |
412 | $message .= ". Found {hitsTotal} total results"; |
413 | $message .= " and returned {hitsReturned} of them starting at {hitsOffset}"; |
414 | } |
415 | if ( isset( $context['namespaces'] ) ) { |
416 | $namespaces = implode( ', ', $context['namespaces'] ); |
417 | $message .= " within these namespaces: $namespaces"; |
418 | } |
419 | if ( isset( $context['suggestion'] ) && strlen( $context['suggestion'] ) > 0 ) { |
420 | $message .= " and suggested '{suggestion}'"; |
421 | } |
422 | $message .= ". Requested via {source} for {identity} by executor {executor}"; |
423 | |
424 | return $message; |
425 | } |
426 | |
427 | /** |
428 | * Enforce all type constraints on the hits field and limit |
429 | * the number of results to the maximum specified. |
430 | * |
431 | * @param array[] $hits |
432 | * @return array[] |
433 | */ |
434 | private function encodeHits( array $hits ) { |
435 | $formattedHits = []; |
436 | foreach ( array_slice( $hits, 0, self::LOG_MAX_RESULTS ) as $hit ) { |
437 | $formattedHit = []; |
438 | Util::setIfDefined( $hit, 'title', $formattedHit, 'page_title', 'strval' ); |
439 | Util::setIfDefined( $hit, 'pageId', $formattedHit, 'page_id', 'intval' ); |
440 | Util::setIfDefined( $hit, 'index', $formattedHit, 'index', 'strval' ); |
441 | Util::setIfDefined( $hit, 'score', $formattedHit, 'score', 'floatval' ); |
442 | Util::setIfDefined( $hit, 'profileName', $formattedHit, 'profile_name', 'strval' ); |
443 | $formattedHits[] = $formattedHit; |
444 | } |
445 | return $formattedHits; |
446 | } |
447 | |
448 | /** |
449 | * Note that this is only accurate for hhvm and php-fpm |
450 | * since they close the request to the user before running |
451 | * deferred updates. |
452 | * |
453 | * @return int The number of ms the php request took |
454 | */ |
455 | private function getPhpRequestTookMs() { |
456 | $timing = RequestContext::getMain()->getTiming(); |
457 | $startMark = $timing->getEntryByName( 'requestStart' ); |
458 | $endMark = $timing->getEntryByName( 'requestShutdown' ); |
459 | if ( $startMark && $endMark ) { |
460 | // should always work, but Timing can return null so |
461 | // fallbacks are provided. |
462 | $tookS = $endMark['startTime'] - $startMark['startTime']; |
463 | } elseif ( isset( $_SERVER['REQUEST_TIME_FLOAT'] ) ) { |
464 | // php >= 5.4 |
465 | $tookS = microtime( true ) - $_SERVER['REQUEST_TIME_FLOAT']; |
466 | } else { |
467 | // php 5.3 |
468 | $tookS = microtime( true ) - $_SERVER['REQUEST_TIME']; |
469 | } |
470 | |
471 | return intval( 1000 * $tookS ); |
472 | } |
473 | } |