MediaWiki  master
ParsoidOutputAccess.php
Go to the documentation of this file.
1 <?php
20 namespace MediaWiki\Parser\Parsoid;
21 
use Config;
use HashConfig;
use IBufferingStatsdDataFactory;
use InvalidArgumentException;
use Liuggio\StatsdClient\Factory\StatsdDataFactory;
use MediaWiki\MainConfigNames;
use ParserCache;
use ParserOptions;
use ParserOutput;
use Status;
use Wikimedia\Parsoid\Config\PageConfig;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\ClientError;
use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
use Wikimedia\Parsoid\Parsoid;
52 
/**
 * Cache name requested from the ParserCacheFactory for both the
 * ParserCache and the RevisionOutputCache used by this service.
 */
public const PARSOID_PARSER_CACHE_NAME = 'parsoid';

/**
 * Extension-data key under which the render ID is stored on a ParserOutput.
 */
private const RENDER_ID_KEY = 'parsoid-render-id';

/**
 * Flag for getParserOutput(): skip the cache lookup and always parse.
 */
public const OPT_FORCE_PARSE = 1;

/**
 * Flag for getParserOutput(): do not save the freshly parsed output to cache.
 */
public const OPT_NO_UPDATE_CACHE = 2;

/**
 * Flag for getParserOutput(): pass 'logLinterData' => true to Parsoid.
 */
public const OPT_LOG_LINT_DATA = 64;

/**
 * Option keys that the ServiceOptions object given to the constructor must provide.
 *
 * Every key this class reads via ServiceOptions::get() has to be listed here:
 * ParsoidCacheConfig is read in the constructor, ParsoidSettings in parse(),
 * and ParsoidWikiID in the constructor.
 */
public const CONSTRUCTOR_OPTIONS = [
	MainConfigNames::ParsoidCacheConfig,
	MainConfigNames::ParsoidSettings,
	'ParsoidWikiID'
];
88 
/** @var RevisionOutputCache Cache consulted/written for non-latest revisions */
private $revisionOutputCache;

/** @var ParserCache Cache consulted/written for a page's latest revision */
private $parserCache;

/** @var GlobalIdGenerator Source of the UUID that is part of each render ID */
private $globalIdGenerator;

/** @var IBufferingStatsdDataFactory Receives the cache hit/miss/save counters */
private $stats;

/** @var Config Wraps the ParsoidCacheConfig setting (e.g. 'CacheThresholdTime') */
private $parsoidCacheConfig;

/** @var Parsoid */
private $parsoid;

/** @var PageConfigFactory */
private $parsoidPageConfigFactory;

/** @var PageLookup Used to resolve a PageIdentity to a PageRecord */
private $pageLookup;

/** @var RevisionLookup Used to resolve a revision ID to a RevisionRecord */
private $revisionLookup;

/** @var SiteConfig */
private $siteConfig;

/** @var ServiceOptions */
private $options;

/** @var string Passed to Parsoid as the "prefix" option (presumably a string wiki ID) */
private $parsoidWikiId;

/** @var IContentHandlerFactory */
private $contentHandlerFactory;

/**
 * @param ServiceOptions $options Must provide every key in self::CONSTRUCTOR_OPTIONS;
 *   MainConfigNames::ParsoidCacheConfig and 'ParsoidWikiID' are read here.
 * @param ParserCacheFactory $parserCacheFactory Supplies the ParserCache and the
 *   RevisionOutputCache named self::PARSOID_PARSER_CACHE_NAME.
 * @param PageLookup $pageLookup
 * @param RevisionLookup $revisionLookup
 * @param GlobalIdGenerator $globalIdGenerator
 * @param IBufferingStatsdDataFactory $stats
 * @param Parsoid $parsoid
 * @param SiteConfig $siteConfig
 * @param PageConfigFactory $parsoidPageConfigFactory
 * @param IContentHandlerFactory $contentHandlerFactory
 */
public function __construct(
	ServiceOptions $options,
	ParserCacheFactory $parserCacheFactory,
	PageLookup $pageLookup,
	RevisionLookup $revisionLookup,
	GlobalIdGenerator $globalIdGenerator,
	IBufferingStatsdDataFactory $stats,
	Parsoid $parsoid,
	SiteConfig $siteConfig,
	PageConfigFactory $parsoidPageConfigFactory,
	IContentHandlerFactory $contentHandlerFactory
) {
	$options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );
	$this->options = $options;
	$this->parsoidCacheConfig = new HashConfig( $options->get( MainConfigNames::ParsoidCacheConfig ) );
	$this->revisionOutputCache = $parserCacheFactory
		->getRevisionOutputCache( self::PARSOID_PARSER_CACHE_NAME );
	$this->parserCache = $parserCacheFactory->getParserCache( self::PARSOID_PARSER_CACHE_NAME );
	$this->pageLookup = $pageLookup;
	$this->revisionLookup = $revisionLookup;
	$this->globalIdGenerator = $globalIdGenerator;
	// $stats must be a real constructor parameter; without it this assignment
	// would read an undefined variable and every later increment() would fail.
	$this->stats = $stats;
	$this->parsoid = $parsoid;
	$this->siteConfig = $siteConfig;
	$this->parsoidPageConfigFactory = $parsoidPageConfigFactory;
	$this->contentHandlerFactory = $contentHandlerFactory;

	// NOTE: This is passed as the "prefix" option to parsoid, which it uses
	//       to locate wiki specific configuration in the baseconfig directory.
	//       This should probably be managed by SiteConfig instead, so
	//       we hopefully will not need it here in the future.
	$this->parsoidWikiId = $options->get( 'ParsoidWikiID' );
}
173 
/**
 * Tell whether Parsoid can be asked to render content of the given model.
 *
 * A model is supported when it is wikitext, when its content handler's
 * default format is wikitext, or when the Parsoid SiteConfig has a
 * dedicated handler for it.
 *
 * @param string $model Content model ID
 *
 * @return bool False for unknown content models.
 */
public function supportsContentModel( string $model ): bool {
	if ( $model === CONTENT_MODEL_WIKITEXT ) {
		return true;
	}

	// Find out whether the content model serializes to wikitext.
	// NOTE: isSupportedFormat( CONTENT_FORMAT_WIKITEXT ) could be used instead if
	//       PageContent::getContent() specified the format when calling serialize().
	try {
		$defaultFormat = $this->contentHandlerFactory
			->getContentHandler( $model )
			->getDefaultFormat();
	} catch ( MWUnknownContentModelException $ex ) {
		// An unknown content model cannot be supported.
		return false;
	}

	return $defaultFormat === CONTENT_FORMAT_WIKITEXT
		|| $this->siteConfig->getContentModelHandler( $model ) !== null;
}
199 
/**
 * Get Parsoid output for a page revision, from cache if possible,
 * parsing (and possibly caching the result) otherwise.
 *
 * @param PageIdentity $page
 * @param ParserOptions $parserOpts
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision.
 * @param int $options Bit field of the OPT_XXX constants of this class.
 *
 * @return Status Good status wrapping a ParserOutput on success, otherwise
 *   the failed status returned by parse().
 */
public function getParserOutput(
	PageIdentity $page,
	ParserOptions $parserOpts,
	$revision = null,
	int $options = 0
): Status {
	[ $page, $revision ] = $this->resolveRevision( $page, $revision );
	$isOld = $revision->getId() !== $page->getLatest();

	// Old revisions and the latest revision live in different caches
	// and are counted under different stats keys.
	if ( $isOld ) {
		$statsKey = 'ParsoidOutputAccess.Cache.revision';
	} else {
		$statsKey = 'ParsoidOutputAccess.Cache.parser';
	}

	$forceParse = (bool)( $options & self::OPT_FORCE_PARSE );
	if ( !$forceParse ) {
		$cached = $this->getCachedParserOutputInternal( $page, $parserOpts, $revision, $isOld, $statsKey );
		if ( $cached ) {
			return Status::newGood( $cached );
		}
	}

	$parsoidOptions = ( $options & self::OPT_LOG_LINT_DATA )
		? [ 'logLinterData' => true ]
		: [];

	$mainSlot = $revision->getSlot( SlotRecord::MAIN );

	$parseStart = microtime( true );
	$status = $this->parse( $page, $parserOpts, $parsoidOptions, $revision );
	$elapsed = microtime( true ) - $parseStart;

	// Decide whether the result gets cached, and record the decision in stats.
	if ( !$status->isOK() ) {
		$saveOutcome = '.save.notok';
	} elseif ( $options & self::OPT_NO_UPDATE_CACHE ) {
		$saveOutcome = '.save.disabled';
	} elseif ( !$this->supportsContentModel( $mainSlot->getModel() ) ) {
		// TODO: We really want to cache for all supported content models.
		//       But supportsContentModels() lies, because of T324711.
		//       This causes us to render garbage output for all content models,
		//       which we shouldn't cache.
		// NOTE: this will become irrelevant when we implement T311648.
		$saveOutcome = '.save.badmodel';
	} elseif ( $elapsed > $this->parsoidCacheConfig->get( 'CacheThresholdTime' ) ) {
		// Only parses slower than the configured threshold are cached.
		$fresh = $status->getValue();
		$cacheTime = $fresh->getCacheTime();

		if ( $isOld ) {
			$this->revisionOutputCache->save( $fresh, $revision, $parserOpts, $cacheTime );
		} else {
			$this->parserCache->save( $fresh, $page, $parserOpts, $cacheTime );
		}
		$saveOutcome = '.save.ok';
	} else {
		$saveOutcome = '.save.skipfast';
	}

	$this->stats->increment( $statsKey . $saveOutcome );

	return $status;
}
275 
/**
 * Run Parsoid's wikitext-to-HTML conversion for the given page config and
 * wrap the outcome in a Status.
 *
 * @param PageConfig $pageConfig
 * @param array $parsoidOptions Options for Parsoid; these take precedence
 *   over the defaults assembled here.
 *
 * @return Status Good status wrapping a ParserOutput, or a fatal status when
 *   Parsoid throws ClientError or ResourceLimitExceededException.
 */
private function parseInternal(
	PageConfig $pageConfig,
	array $parsoidOptions
): Status {
	// Array union keeps the left-hand value for duplicate keys, so
	// caller-supplied options override the defaults on the right.
	$effectiveOptions = $parsoidOptions + [
		'pageBundle' => true,
		'prefix' => $this->parsoidWikiId,
		'pageName' => $pageConfig->getTitle(),
		'htmlVariantLanguage' => $pageConfig->getPageLanguageBcp47(),
		'outputContentVersion' => Parsoid::defaultHTMLVersion(),
	];

	try {
		$began = microtime( true );
		$metadata = new ParserOutput();
		// NOTE(review): $headers is not set beforehand and not read afterwards;
		// presumably wikitext2html() takes it by reference. Confirm against Parsoid.
		$pageBundle = $this->parsoid->wikitext2html( $pageConfig, $effectiveOptions, $headers, $metadata );

		$output = PageBundleParserOutputConverter::parserOutputFromPageBundle( $pageBundle, $metadata );

		$elapsed = microtime( true ) - $began;
		if ( $elapsed > 3 ) {
			// Leave a trace of parses that took longer than three seconds.
			$slowLog = LoggerFactory::getInstance( 'slow-parsoid' );
			$slowLog->info( 'Parsing {title} was slow, took {time} seconds', [
				'time' => number_format( $elapsed, 2 ),
				'title' => $pageConfig->getTitle(),
			] );
		}

		return Status::newGood( $output );
	} catch ( ClientError $e ) {
		return Status::newFatal( 'parsoid-client-error', $e->getMessage() );
	} catch ( ResourceLimitExceededException $e ) {
		return Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
	}
}
320 
/**
 * Extract the Parsoid render ID stored in a ParserOutput's extension data.
 *
 * @param ParserOutput $parserOutput
 *
 * @return ParsoidRenderID
 * @throws InvalidArgumentException If no render ID is stored on $parserOutput.
 */
public function getParsoidRenderID( ParserOutput $parserOutput ): ParsoidRenderID {
	// XXX: The ParserOutput may come from the parser cache, so take care
	//      when changing how the render key is stored in it.
	$key = $parserOutput->getExtensionData( self::RENDER_ID_KEY );

	if ( $key ) {
		return ParsoidRenderID::newFromKey( $key );
	}

	throw new InvalidArgumentException( 'ParserOutput does not have a render ID' );
}
339 
/**
 * Look up cached Parsoid output for a page revision without ever parsing.
 *
 * @param PageIdentity $page
 * @param ParserOptions $parserOpts
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision.
 *
 * @return ParserOutput|null The cached output, or null on a cache miss.
 */
public function getCachedParserOutput(
	PageIdentity $page,
	ParserOptions $parserOpts,
	$revision = null
): ?ParserOutput {
	[ $page, $revision ] = $this->resolveRevision( $page, $revision );

	// Anything but the page's latest revision is looked up in the revision cache.
	$isOld = $revision->getId() !== $page->getLatest();
	if ( $isOld ) {
		$statsKey = 'ParsoidOutputAccess.Cache.revision';
	} else {
		$statsKey = 'ParsoidOutputAccess.Cache.parser';
	}

	return $this->getCachedParserOutputInternal( $page, $parserOpts, $revision, $isOld, $statsKey );
}
365 
/**
 * Fetch a usable ParserOutput from the appropriate cache and count the
 * lookup as a hit or a miss.
 *
 * @param PageRecord $page
 * @param ParserOptions $parserOpts
 * @param RevisionRecord|null $revision
 * @param bool $isOld Whether to use the revision output cache rather than
 *   the parser cache.
 * @param string $statsKey Prefix for the '.get.hit' / '.get.miss' counters.
 *
 * @return ParserOutput|null Null when nothing usable is cached.
 */
protected function getCachedParserOutputInternal(
	PageRecord $page,
	ParserOptions $parserOpts,
	?RevisionRecord $revision,
	bool $isOld,
	string $statsKey
): ?ParserOutput {
	$cached = $isOld
		? $this->revisionOutputCache->get( $revision, $parserOpts )
		: $this->parserCache->get( $page, $parserOpts );

	// A cached entry lacking the page bundle or the render ID is incomplete
	// (stored by an old version of the code) and is treated as a miss.
	$usable = $cached
		&& $cached->getExtensionData( PageBundleParserOutputConverter::PARSOID_PAGE_BUNDLE_KEY )
		&& $cached->getExtensionData( self::RENDER_ID_KEY );

	if ( !$usable ) {
		$this->stats->increment( $statsKey . '.get.miss' );
		return null;
	}

	$this->stats->increment( $statsKey . '.get.hit' );
	return $cached;
}
406 
/**
 * Build placeholder output for a content model Parsoid cannot render.
 *
 * @param string $contentModel The unsupported content model ID
 *
 * @return Status Good status wrapping the placeholder ParserOutput.
 */
private function makeDummyParserOutput( string $contentModel ): Status {
	$dummy = new ParserOutput(
		"Dummy output. Parsoid does not support content model $contentModel. See T324711."
	);

	// Cheap to regenerate, so there is no point in writing it to the parser cache.
	$dummy->updateCacheExpiry( 0 );
	// Rendering dummy output still requires a render ID: T311728.
	$dummy->setExtensionData( self::RENDER_ID_KEY, '0/dummy-output' );

	return Status::newGood( $dummy );
}
418 
/**
 * Parse a page revision with Parsoid, bypassing all caches.
 *
 * On success the returned ParserOutput carries a newly generated render ID
 * plus the cache revision ID and cache time.
 *
 * @param PageIdentity $page
 * @param ParserOptions $parserOpts
 * @param array $parsoidOptions Options handed through to Parsoid.
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision.
 *
 * @return Status Good status wrapping a ParserOutput, or the failed status
 *   from parseInternal().
 */
public function parse(
	PageIdentity $page,
	ParserOptions $parserOpts,
	array $parsoidOptions,
	$revision
): Status {
	// NOTE: A RevisionRecord can be used as it is. Resolving $page to a PageRecord
	//       is then unnecessary (and may be impossible if the page doesn't exist).
	if ( !( $revision instanceof RevisionRecord ) ) {
		[ $page, $revision ] = $this->resolveRevision( $page, $revision );
	}

	$contentModel = $revision->getSlot( SlotRecord::MAIN )->getModel();
	if ( !$this->supportsContentModel( $contentModel ) ) {
		// This is a messy fix for T324711. The real solution is T311648.
		// For now, just return dummy parser output.
		// TODO: go back to throwing (an internal exception, converted to an HttpError
		//       in HtmlOutputRendererHelper) once RESTbase no longer expects to get
		//       a parsoid rendering for any kind of content (T324711).
		return $this->makeDummyParserOutput( $contentModel );
	}

	$pageConfig = $this->parsoidPageConfigFactory->create(
		$page,
		null,
		$revision,
		null,
		$parserOpts->getTargetLanguage(),
		$this->options->get( MainConfigNames::ParsoidSettings )
	);

	$status = $this->parseInternal( $pageConfig, $parsoidOptions );

	if ( $status->isOK() ) {
		$output = $status->getValue();

		// TODO: when we make tighter integration with Parsoid, render ID should become
		//       a standard ParserOutput property. Nothing else needs it now, so don't
		//       generate it in ParserCache just yet.
		$revId = $revision->getId();
		$renderId = new ParsoidRenderID( $revId, $this->globalIdGenerator->newUUIDv1() );
		$output->setExtensionData( self::RENDER_ID_KEY, $renderId->getKey() );

		// XXX: ParserOutput should just always record the revision ID and timestamp
		$timestamp = wfTimestampNow();
		$output->setCacheRevisionId( $revId );
		$output->setCacheTime( $timestamp );
	}

	return $status;
}
487 
/**
 * Turn a page reference and a loosely typed revision into concrete records.
 *
 * @param PageIdentity $page
 * @param RevisionRecord|int|null $revision Revision object or ID; null means
 *   the page's latest revision. Any non-null, non-int value is returned unchanged.
 *
 * @return array A two-element list: [ PageRecord, revision ].
 * @throws RevisionAccessException If the page or the revision ID cannot be found.
 */
private function resolveRevision( PageIdentity $page, $revision ): array {
	if ( !( $page instanceof PageRecord ) ) {
		// Capture the printable name first; $page is replaced by the lookup result.
		$pageName = "$page";
		$page = $this->pageLookup->getPageByReference( $page );

		if ( !$page ) {
			throw new RevisionAccessException(
				'Page {name} not found',
				[ 'name' => $pageName ]
			);
		}
	}

	$revision = $revision ?? $page->getLatest();

	if ( !is_int( $revision ) ) {
		// Already an object: nothing left to look up.
		return [ $page, $revision ];
	}

	$revisionRecord = $this->revisionLookup->getRevisionById( $revision );
	if ( !$revisionRecord ) {
		throw new RevisionAccessException(
			'Revision {revId} not found',
			[ 'revId' => $revision ]
		);
	}

	return [ $page, $revisionRecord ];
}
524 }
const CONTENT_MODEL_WIKITEXT
Definition: Defines.php:211
const CONTENT_FORMAT_WIKITEXT
Wikitext.
Definition: Defines.php:227
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
getExtensionData()
Get the extension data as: augmentor name => data.
if(!defined('MW_SETUP_CALLBACK'))
Definition: WebStart.php:88
setCacheRevisionId( $id)
Definition: CacheTime.php:106
setCacheTime( $t)
setCacheTime() sets the timestamp expressing when the page has been rendered.
Definition: CacheTime.php:81
getCacheTime()
Definition: CacheTime.php:67
A Config instance which stores all settings as a member variable.
Definition: HashConfig.php:30
Exception thrown when an unregistered content model is requested.
A class for passing options to services.
PSR-3 logger instance factory.
A class containing constants representing the names of configuration variables.
const ParsoidCacheConfig
Name constant for the ParsoidCacheConfig setting, for use with Config::get()
const ParsoidSettings
Name constant for the ParsoidSettings setting, for use with Config::get()
getRevisionOutputCache(string $name)
Get a RevisionOutputCache instance by $name.
getParserCache(string $name)
Get a ParserCache instance by $name.
Helper class used by MediaWiki to create Parsoid PageConfig objects.
MediaWiki service for getting Parsoid Output objects.
getCachedParserOutputInternal(PageRecord $page, ParserOptions $parserOpts, ?RevisionRecord $revision, bool $isOld, string $statsKey)
parse(PageIdentity $page, ParserOptions $parserOpts, array $parsoidOptions, $revision)
getParsoidRenderID(ParserOutput $parserOutput)
NOTE: This needs to be ParserOutput returned by ->getParserOutput() in this class.
getParserOutput(PageIdentity $page, ParserOptions $parserOpts, $revision=null, int $options=0)
__construct(ServiceOptions $options, ParserCacheFactory $parserCacheFactory, PageLookup $pageLookup, RevisionLookup $revisionLookup, GlobalIdGenerator $globalIdGenerator, IBufferingStatsdDataFactory $stats, Parsoid $parsoid, SiteConfig $siteConfig, PageConfigFactory $parsoidPageConfigFactory, IContentHandlerFactory $contentHandlerFactory)
getCachedParserOutput(PageIdentity $page, ParserOptions $parserOpts, $revision=null)
Represents the identity of a specific rendering of a specific revision at some point in time.
Cache for ParserOutput objects.
Exception representing a failure to look up a revision.
Page revision base class.
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
Cache for ParserOutput objects corresponding to the latest page revisions.
Definition: ParserCache.php:64
Set options of the Parser.
getTargetLanguage()
Target language for the parse.
getExtensionData( $key)
Gets extensions data previously attached to this ParserOutput using setExtensionData().
setExtensionData( $key, $value)
Attaches arbitrary data to this ParserObject.
parse( $text, PageReference $page, ParserOptions $options, $linestart=true, $clearState=true, $revid=null)
Convert wikitext to HTML Do not call this function recursively.
Definition: Parser.php:668
static newFatal( $message,... $parameters)
Factory function for fatal errors.
Definition: StatusValue.php:73
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:85
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition: Status.php:46
Class for getting statistically unique IDs without a central coordinator.
Interface for configuration instances.
Definition: Config.php:30
MediaWiki adaptation of StatsdDataFactory that provides buffering functionality.
Interface for objects (potentially) representing an editable wiki page.
Service for looking up information about wiki pages.
Definition: PageLookup.php:17
Data record representing a page that is (or used to be, or could be) an editable page on a wiki.
Definition: PageRecord.php:24
Service for looking up page revisions.
return true
Definition: router.php:90