MediaWiki  master
ParsoidOutputAccess.php
Go to the documentation of this file.
1 <?php
20 namespace MediaWiki\Parser\Parsoid;
21 
22 use Config;
23 use HashConfig;
25 use InvalidArgumentException;
26 use Liuggio\StatsdClient\Factory\StatsdDataFactory;
42 use ParserCache;
43 use ParserOptions;
44 use ParserOutput;
45 use Status;
46 use Wikimedia\Parsoid\Config\PageConfig;
47 use Wikimedia\Parsoid\Config\SiteConfig;
48 use Wikimedia\Parsoid\Core\ClientError;
49 use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
50 use Wikimedia\Parsoid\Parsoid;
52 
/**
 * Name under which the Parsoid-specific ParserCache and RevisionOutputCache
 * instances are requested from the ParserCacheFactory.
 */
public const PARSOID_PARSER_CACHE_NAME = 'parsoid';

/**
 * Extension-data key under which the render ID key is stored in a ParserOutput.
 */
private const RENDER_ID_KEY = 'parsoid-render-id';

/**
 * Flag for getParserOutput(): skip the cache lookup and always parse.
 */
public const OPT_FORCE_PARSE = 1;

/**
 * Flag for getParserOutput(): do not write the freshly parsed output to the cache.
 */
public const OPT_NO_UPDATE_CACHE = 2;

/**
 * Flag for getParserOutput(): pass 'logLinterData' => true to Parsoid.
 */
public const OPT_LOG_LINT_DATA = 64;

/**
 * Names of all options this class reads from the ServiceOptions it is given.
 *
 * ParsoidCacheConfig is read in the constructor and ParsoidSettings in parse(),
 * so both have to be listed here alongside ParsoidWikiID; otherwise they are
 * not part of the option set checked by assertRequiredOptions().
 */
public const CONSTRUCTOR_OPTIONS = [
	MainConfigNames::ParsoidCacheConfig,
	MainConfigNames::ParsoidSettings,
	'ParsoidWikiID'
];

/** @var RevisionOutputCache Cache used for output of non-current revisions */
private $revisionOutputCache;

/** @var ParserCache Cache used for output of the current revision of a page */
private $parserCache;

/** @var GlobalIdGenerator Source of the UUIDs used in render IDs */
private $globalIdGenerator;

/** @var \IBufferingStatsdDataFactory Receives the cache hit/miss/save counters */
private $stats;

/** @var Config The ParsoidCacheConfig setting, wrapped in a HashConfig */
private $parsoidCacheConfig;

/** @var Parsoid */
private $parsoid;

/** @var PageConfigFactory */
private $parsoidPageConfigFactory;

/** @var PageLookup */
private $pageLookup;

/** @var RevisionLookup */
private $revisionLookup;

/** @var SiteConfig */
private $siteConfig;

/** @var ServiceOptions */
private $options;

/** @var string Value of the ParsoidWikiID option, passed to Parsoid as "prefix" */
private $parsoidWikiId;

/** @var IContentHandlerFactory */
private $contentHandlerFactory;
127 
/**
 * @param ServiceOptions $options Must provide exactly the keys listed in
 *   self::CONSTRUCTOR_OPTIONS.
 * @param ParserCacheFactory $parserCacheFactory Used to obtain the ParserCache and
 *   RevisionOutputCache registered under self::PARSOID_PARSER_CACHE_NAME.
 * @param PageLookup $pageLookup
 * @param RevisionLookup $revisionLookup
 * @param GlobalIdGenerator $globalIdGenerator
 * @param \IBufferingStatsdDataFactory $stats Receives the cache statistics counters.
 * @param Parsoid $parsoid
 * @param SiteConfig $siteConfig
 * @param PageConfigFactory $parsoidPageConfigFactory
 * @param IContentHandlerFactory $contentHandlerFactory
 */
public function __construct(
	ServiceOptions $options,
	ParserCacheFactory $parserCacheFactory,
	PageLookup $pageLookup,
	RevisionLookup $revisionLookup,
	GlobalIdGenerator $globalIdGenerator,
	// Fully qualified so that this signature does not depend on a matching
	// "use" statement being present at the top of the file.
	\IBufferingStatsdDataFactory $stats,
	Parsoid $parsoid,
	SiteConfig $siteConfig,
	PageConfigFactory $parsoidPageConfigFactory,
	IContentHandlerFactory $contentHandlerFactory
) {
	$options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );
	$this->options = $options;
	$this->parsoidCacheConfig = new HashConfig( $options->get( MainConfigNames::ParsoidCacheConfig ) );
	$this->revisionOutputCache = $parserCacheFactory
		->getRevisionOutputCache( self::PARSOID_PARSER_CACHE_NAME );
	$this->parserCache = $parserCacheFactory->getParserCache( self::PARSOID_PARSER_CACHE_NAME );
	$this->pageLookup = $pageLookup;
	$this->revisionLookup = $revisionLookup;
	$this->globalIdGenerator = $globalIdGenerator;
	$this->stats = $stats;
	$this->parsoid = $parsoid;
	$this->siteConfig = $siteConfig;
	$this->parsoidPageConfigFactory = $parsoidPageConfigFactory;
	$this->contentHandlerFactory = $contentHandlerFactory;

	// NOTE: This is passed as the "prefix" option to parsoid, which it uses
	//       to locate wiki specific configuration in the baseconfig directory.
	//       This should probably be managed by SiteConfig instead, so
	//       we hopefully will not need it here in the future.
	$this->parsoidWikiId = $options->get( 'ParsoidWikiID' );
}
173 
/**
 * Tells whether output can be produced for the given content model.
 *
 * A model is supported when it is wikitext, when its content handler uses
 * wikitext as its default format, or when the Parsoid SiteConfig has a
 * content model handler registered for it.
 *
 * @param string $model Content model ID
 * @return bool False for unknown or unsupported models
 */
public function supportsContentModel( string $model ): bool {
	if ( $model !== CONTENT_MODEL_WIKITEXT ) {
		// Check if the content model serializes to wikitext.
		// NOTE: We could use isSupportedFormat( CONTENT_FORMAT_WIKITEXT ) if PageContent::getContent()
		//       would specify the format when calling serialize().
		try {
			$defaultFormat = $this->contentHandlerFactory
				->getContentHandler( $model )
				->getDefaultFormat();
		} catch ( MWUnknownContentModelException $ex ) {
			// An unknown content model can't be supported.
			return false;
		}

		if ( $defaultFormat !== CONTENT_FORMAT_WIKITEXT ) {
			// Last resort: ask Parsoid's site configuration for a dedicated handler.
			return $this->siteConfig->getContentModelHandler( $model ) !== null;
		}
	}

	return true;
}
199 
/**
 * Returns the Parsoid rendering of a page revision, from cache if possible.
 *
 * Unless OPT_FORCE_PARSE is set, the matching cache is consulted first. On a
 * miss the revision is parsed, and the result is written back to the cache
 * unless parsing failed, OPT_NO_UPDATE_CACHE is set, the content model is not
 * supported, or the parse was faster than the configured CacheThresholdTime.
 *
 * @param PageIdentity $page
 * @param ParserOptions $parserOpts
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision.
 * @param int $options Bitfield of the OPT_XXX constants
 * @return Status Holds a ParserOutput as its value when good
 */
public function getParserOutput(
	PageIdentity $page,
	ParserOptions $parserOpts,
	$revision = null,
	int $options = 0
): Status {
	[ $page, $revision ] = $this->resolveRevision( $page, $revision );
	$isOld = $revision->getId() !== $page->getLatest();

	$statsKey = $isOld ? 'ParsoidOutputAccess.Cache.revision' : 'ParsoidOutputAccess.Cache.parser';

	$forceParse = (bool)( $options & self::OPT_FORCE_PARSE );
	if ( !$forceParse ) {
		$cachedOutput = $this->getCachedParserOutputInternal(
			$page,
			$parserOpts,
			$revision,
			$isOld,
			$statsKey
		);

		if ( $cachedOutput ) {
			return Status::newGood( $cachedOutput );
		}
	}

	$parsoidOptions = ( $options & self::OPT_LOG_LINT_DATA )
		? [ 'logLinterData' => true ]
		: [];

	$mainSlot = $revision->getSlot( SlotRecord::MAIN );

	$parseStart = microtime( true );
	$status = $this->parse( $page, $parserOpts, $parsoidOptions, $revision );
	$elapsed = microtime( true ) - $parseStart;

	// Work out what happened with respect to caching; the outcome is
	// reported as a single counter at the end.
	if ( !$status->isOK() ) {
		$outcome = 'notok';
	} elseif ( $options & self::OPT_NO_UPDATE_CACHE ) {
		$outcome = 'disabled';
	} elseif ( !$this->supportsContentModel( $mainSlot->getModel() ) ) {
		// TODO: We really want to cache for all supported content models.
		//       But supportsContentModel() lies, because of T324711.
		//       This causes us to render garbage output for all content models,
		//       which we shouldn't cache.
		// NOTE: this will become irrelevant when we implement T311648.
		$outcome = 'badmodel';
	} elseif ( $elapsed > $this->parsoidCacheConfig->get( 'CacheThresholdTime' ) ) {
		// Only renderings that were expensive enough are worth storing.
		$freshOutput = $status->getValue();
		$cacheTime = $freshOutput->getCacheTime();

		if ( $isOld ) {
			$this->revisionOutputCache->save( $freshOutput, $revision, $parserOpts, $cacheTime );
		} else {
			$this->parserCache->save( $freshOutput, $page, $parserOpts, $cacheTime );
		}
		$outcome = 'ok';
	} else {
		$outcome = 'skipfast';
	}

	$this->stats->increment( $statsKey . '.save.' . $outcome );

	return $status;
}
275 
/**
 * Runs Parsoid's wikitext-to-HTML conversion for a page config and wraps the
 * outcome in a Status.
 *
 * @param PageConfig $pageConfig
 * @param array $parsoidOptions Options for Parsoid; these take precedence
 *   over the defaults assembled here.
 * @return Status Good with a ParserOutput value, or fatal when Parsoid throws
 *   a ClientError or ResourceLimitExceededException.
 */
private function parseInternal(
	PageConfig $pageConfig,
	array $parsoidOptions
): Status {
	// Array union keeps the left-hand value for duplicate keys, so anything
	// the caller supplied overrides the defaults on the right.
	$effectiveOptions = $parsoidOptions + [
		'pageBundle' => true,
		'prefix' => $this->parsoidWikiId,
		'pageName' => $pageConfig->getTitle(),
		'htmlVariantLanguage' => $pageConfig->getPageLanguage(),
		'outputContentVersion' => Parsoid::defaultHTMLVersion(),
	];

	try {
		$begin = microtime( true );
		$bundle = $this->parsoid->wikitext2html( $pageConfig, $effectiveOptions );
		$output = PageBundleParserOutputConverter::parserOutputFromPageBundle( $bundle );
		$elapsed = microtime( true ) - $begin;

		// Leave a trace of parses that took more than three seconds.
		if ( $elapsed > 3 ) {
			$slowLog = LoggerFactory::getInstance( 'slow-parsoid' );
			$slowLog->info( 'Parsing {title} was slow, took {time} seconds', [
				'time' => number_format( $elapsed, 2 ),
				'title' => $pageConfig->getTitle(),
			] );
		}

		return Status::newGood( $output );
	} catch ( ClientError $e ) {
		return Status::newFatal( 'parsoid-client-error', $e->getMessage() );
	} catch ( ResourceLimitExceededException $e ) {
		return Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
	}
}
317 
/**
 * Extracts the render ID that parse() stored in a ParserOutput.
 *
 * @param ParserOutput $parserOutput
 * @return ParsoidRenderID
 * @throws InvalidArgumentException If the output carries no render ID
 */
public function getParsoidRenderID( ParserOutput $parserOutput ): ParsoidRenderID {
	// XXX: The ParserOutput may come from the parser cache, so be careful when
	//      changing how the render key is stored in the ParserOutput object.
	$renderKey = $parserOutput->getExtensionData( self::RENDER_ID_KEY );

	if ( $renderKey ) {
		return ParsoidRenderID::newFromKey( $renderKey );
	}

	throw new InvalidArgumentException( 'ParserOutput does not have a render ID' );
}
336 
/**
 * Looks up a cached Parsoid rendering without ever triggering a parse.
 *
 * @param PageIdentity $page
 * @param ParserOptions $parserOpts
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision.
 * @return ParserOutput|null Null on a cache miss
 */
public function getCachedParserOutput(
	PageIdentity $page,
	ParserOptions $parserOpts,
	$revision = null
): ?ParserOutput {
	[ $pageRecord, $revisionRecord ] = $this->resolveRevision( $page, $revision );

	// Anything but the page's latest revision lives in the revision output cache.
	$isOld = $revisionRecord->getId() !== $pageRecord->getLatest();
	$statsKey = $isOld ? 'ParsoidOutputAccess.Cache.revision' : 'ParsoidOutputAccess.Cache.parser';

	return $this->getCachedParserOutputInternal(
		$pageRecord,
		$parserOpts,
		$revisionRecord,
		$isOld,
		$statsKey
	);
}
362 
/**
 * Fetches a rendering from the appropriate cache and records a hit or miss.
 *
 * @param PageRecord $page
 * @param ParserOptions $parserOpts
 * @param RevisionRecord|null $revision
 * @param bool $isOld Whether to use the revision output cache instead of the
 *   parser cache
 * @param string $statsKey Prefix for the '.get.hit' / '.get.miss' counters
 * @return ParserOutput|null Null when nothing usable is cached
 */
protected function getCachedParserOutputInternal(
	PageRecord $page,
	ParserOptions $parserOpts,
	?RevisionRecord $revision,
	bool $isOld,
	string $statsKey
): ?ParserOutput {
	$cached = $isOld
		? $this->revisionOutputCache->get( $revision, $parserOpts )
		: $this->parserCache->get( $page, $parserOpts );

	// A cached ParserOutput lacking the page bundle or the render ID is
	// incomplete (stored by an old version of the code) and is ignored.
	$usable = $cached
		&& $cached->getExtensionData( PageBundleParserOutputConverter::PARSOID_PAGE_BUNDLE_KEY )
		&& $cached->getExtensionData( self::RENDER_ID_KEY );

	if ( !$usable ) {
		$this->stats->increment( $statsKey . '.get.miss' );
		return null;
	}

	$this->stats->increment( $statsKey . '.get.hit' );
	return $cached;
}
403 
/**
 * Builds placeholder output for a content model Parsoid cannot render.
 *
 * @param string $contentModel
 * @return Status Good status holding the placeholder ParserOutput
 */
private function makeDummyParserOutput( string $contentModel ): Status {
	$dummy = new ParserOutput(
		"Dummy output. Parsoid does not support content model $contentModel. See T324711."
	);

	// The render ID is required for rendering of dummy output: T311728.
	$dummy->setExtensionData( self::RENDER_ID_KEY, '0/dummy-output' );
	// This is fast to generate so it's fine not to write this to parser cache.
	$dummy->updateCacheExpiry( 0 );

	return Status::newGood( $dummy );
}
415 
/**
 * Parses a revision with Parsoid, bypassing all caches.
 *
 * On success the returned ParserOutput is tagged with a new render ID, the
 * revision ID and the current timestamp. For unsupported content models a
 * dummy output is returned instead of parsing.
 *
 * @param PageIdentity $page
 * @param ParserOptions $parserOpts
 * @param array $parsoidOptions Options passed on to Parsoid
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision.
 * @return Status Holds a ParserOutput as its value when good
 */
public function parse(
	PageIdentity $page,
	ParserOptions $parserOpts,
	array $parsoidOptions,
	$revision
): Status {
	// NOTE: A RevisionRecord can be used as-is; there is no need to resolve $page to
	//       a PageRecord (which may not even be possible if the page doesn't exist).
	if ( !( $revision instanceof RevisionRecord ) ) {
		[ $page, $revision ] = $this->resolveRevision( $page, $revision );
	}

	$contentModel = $revision->getSlot( SlotRecord::MAIN )->getModel();
	if ( !$this->supportsContentModel( $contentModel ) ) {
		// This is a messy fix for T324711. The real solution is T311648.
		// For now, just return dummy parser output.
		// TODO: go back to throwing (an internal exception that is converted to an
		//       HttpError in HtmlOutputRendererHelper) once RESTbase no longer expects
		//       to get a parsoid rendering for any kind of content (T324711).
		return $this->makeDummyParserOutput( $contentModel );
	}

	$targetLanguage = $parserOpts->getTargetLanguage();
	$pageConfig = $this->parsoidPageConfigFactory->create(
		$page,
		null,
		$revision,
		null,
		$targetLanguage ? $targetLanguage->getCode() : null,
		$this->options->get( MainConfigNames::ParsoidSettings )
	);

	$status = $this->parseInternal( $pageConfig, $parsoidOptions );

	if ( $status->isOK() ) {
		$output = $status->getValue();
		$revId = $revision->getId();

		// TODO: when we make tighter integration with Parsoid, render ID should become
		//       a standard ParserOutput property. Nothing else needs it now, so don't
		//       generate it in ParserCache just yet.
		$renderId = new ParsoidRenderID( $revId, $this->globalIdGenerator->newUUIDv1() );
		$output->setExtensionData( self::RENDER_ID_KEY, $renderId->getKey() );

		// XXX: ParserOutput should just always record the revision ID and timestamp
		$timestamp = wfTimestampNow();
		$output->setCacheRevisionId( $revId );
		$output->setCacheTime( $timestamp );
	}

	return $status;
}
485 
/**
 * Turns a page reference and a loosely typed revision into concrete records.
 *
 * @param PageIdentity $page
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision.
 * @return array A two-element list: [ PageRecord, RevisionRecord ]
 * @throws RevisionAccessException If the page or the revision cannot be found
 */
private function resolveRevision( PageIdentity $page, $revision ): array {
	if ( !( $page instanceof PageRecord ) ) {
		// Capture the readable name first; $page is overwritten by the lookup.
		$pageName = "$page";
		$page = $this->pageLookup->getPageByReference( $page );
		if ( !$page ) {
			throw new RevisionAccessException(
				'Page {name} not found',
				[ 'name' => $pageName ]
			);
		}
	}

	// No revision given: fall back to the page's latest revision ID.
	$revision = $revision ?? $page->getLatest();

	if ( is_int( $revision ) ) {
		$requestedId = $revision;
		$revision = $this->revisionLookup->getRevisionById( $requestedId );

		if ( !$revision ) {
			throw new RevisionAccessException(
				'Revision {revId} not found',
				[ 'revId' => $requestedId ]
			);
		}
	}

	return [ $page, $revision ];
}
522 }
const CONTENT_MODEL_WIKITEXT
Definition: Defines.php:211
const CONTENT_FORMAT_WIKITEXT
Wikitext.
Definition: Defines.php:227
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
getExtensionData()
Get the extension data as: augmentor name => data.
if(!defined('MW_SETUP_CALLBACK'))
The persistent session ID (if any) loaded at startup.
Definition: WebStart.php:82
setCacheRevisionId( $id)
Definition: CacheTime.php:106
setCacheTime( $t)
setCacheTime() sets the timestamp expressing when the page has been rendered.
Definition: CacheTime.php:81
getCacheTime()
Definition: CacheTime.php:67
A Config instance which stores all settings as a member variable.
Definition: HashConfig.php:30
Exception thrown when an unregistered content model is requested.
A class for passing options to services.
PSR-3 logger instance factory.
A class containing constants representing the names of configuration variables.
const ParsoidCacheConfig
Name constant for the ParsoidCacheConfig setting, for use with Config::get()
const ParsoidSettings
Name constant for the ParsoidSettings setting, for use with Config::get()
getRevisionOutputCache(string $name)
Get a RevisionOutputCache instance by $name.
getParserCache(string $name)
Get a ParserCache instance by $name.
Helper class used by MediaWiki to create Parsoid PageConfig objects.
MediaWiki service for getting Parsoid Output objects.
getCachedParserOutputInternal(PageRecord $page, ParserOptions $parserOpts, ?RevisionRecord $revision, bool $isOld, string $statsKey)
parse(PageIdentity $page, ParserOptions $parserOpts, array $parsoidOptions, $revision)
getParsoidRenderID(ParserOutput $parserOutput)
NOTE: This needs to be ParserOutput returned by ->getParserOutput() in this class.
getParserOutput(PageIdentity $page, ParserOptions $parserOpts, $revision=null, int $options=0)
__construct(ServiceOptions $options, ParserCacheFactory $parserCacheFactory, PageLookup $pageLookup, RevisionLookup $revisionLookup, GlobalIdGenerator $globalIdGenerator, IBufferingStatsdDataFactory $stats, Parsoid $parsoid, SiteConfig $siteConfig, PageConfigFactory $parsoidPageConfigFactory, IContentHandlerFactory $contentHandlerFactory)
getCachedParserOutput(PageIdentity $page, ParserOptions $parserOpts, $revision=null)
Represents the identity of a specific rendering of a specific revision at some point in time.
Cache for ParserOutput objects.
Exception representing a failure to look up a revision.
Page revision base class.
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
Cache for ParserOutput objects corresponding to the latest page revisions.
Definition: ParserCache.php:64
Set options of the Parser.
getTargetLanguage()
Target language for the parse.
getExtensionData( $key)
Gets extensions data previously attached to this ParserOutput using setExtensionData().
setExtensionData( $key, $value)
Attaches arbitrary data to this ParserOutput.
parse( $text, PageReference $page, ParserOptions $options, $linestart=true, $clearState=true, $revid=null)
Convert wikitext to HTML. Do not call this function recursively.
Definition: Parser.php:689
static newFatal( $message,... $parameters)
Factory function for fatal errors.
Definition: StatusValue.php:73
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:85
Generic operation result class. Has warning/error list, boolean status and arbitrary value.
Definition: Status.php:46
Class for getting statistically unique IDs without a central coordinator.
Interface for configuration instances.
Definition: Config.php:30
MediaWiki adaptation of StatsdDataFactory that provides buffering functionality.
Interface for objects (potentially) representing an editable wiki page.
Service for looking up information about wiki pages.
Definition: PageLookup.php:17
Data record representing a page that is (or used to be, or could be) an editable page on a wiki.
Definition: PageRecord.php:24
Service for looking up page revisions.
return true
Definition: router.php:90