Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
51.17% |
240 / 469 |
|
31.34% |
21 / 67 |
CRAP | |
0.00% |
0 / 1 |
SiteConfig | |
51.17% |
240 / 469 |
|
31.34% |
21 / 67 |
3255.88 | |
0.00% |
0 / 1 |
registerExtensionModule | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
unregisterExtensionModule | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getExtensionModules | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getLogger | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
setLogger | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
galleryOptions | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
addHTMLTemplateParameters | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
metrics | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
incrementCounter | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
observeTiming | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
scrubBidiChars | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
allowedExternalImagePrefixes | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
baseURI | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
relativeLinkPrefix | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
bswPagePropRegexp | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
canonicalNamespaceId | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
namespaceId | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
namespaceName | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
namespaceHasSubpages | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
namespaceCase | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
namespaceIsTalk | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
ucfirst | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
specialPageLocalName | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
interwikiMagic | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
magicLinkEnabled | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
interwikiMap | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
interwikiMapNoNamespaces | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
interwikiMatcher | |
100.00% |
53 / 53 |
|
100.00% |
1 / 1 |
15 | |||
iwp | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
legalTitleChars | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
linkPrefixRegex | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
linkTrail | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
linkTrailRegex | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
langBcp47 | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
mainPageLinkTarget | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getMWConfigValue | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
rtl | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
langConverterEnabledBcp47 | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
script | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
scriptpath | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
server | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
exportMetadataToHeadBcp47 | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
exportMetadataHelper | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
42 | |||
redirectRegexp | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
categoryRegexp | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
bswRegexp | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
solTransparentWikitextRegexp | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
6 | |||
solTransparentWikitextNoWsRegexp | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
12 | |||
timezoneOffset | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
variantsFor | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
widthOption | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getVariableIDs | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getMagicWords | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
haveComputedFunctionSynonyms | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getFunctionSynonyms | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
updateFunctionSynonym | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
resetMagicWords | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
populateMagicWords | |
95.65% |
22 / 23 |
|
0.00% |
0 / 1 |
9 | |||
mwAliases | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getMagicWordForParserFunction | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
2 | |||
getMagicWordForVariable | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getMagicWordCanonicalName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
20 | |||
getMagicWordForMediaOption | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getMagicWordForBehaviorSwitch | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
isBehaviorSwitch | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getMagicWordWT | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
getMagicWordMatcher | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getParameterizedAliasMatcher | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getMediaPrefixParameterizedAliasMatcher | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getMaxTemplateDepth | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getSpecialNSAliases | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getSpecialPageAliases | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
quoteTitleRe | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getExtResourceURLPatternMatcher | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
7 | |||
linterEnabled | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getLinterSiteConfig | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
makeExtResourceURL | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
8 | |||
getProtocols | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getProtocolsRegex | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
hasValidProtocol | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
findValidProtocol | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
fakeTimestamp | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getNonNativeExtensionTags | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getObjectFactory | |
33.33% |
2 / 6 |
|
0.00% |
0 / 1 |
1.30 | |||
shouldValidateExtConfig | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
constructExtConfig | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
2 | |||
tagNeedsNowikiStrippedInTagPF | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getExtensionModuleSchema | |
60.00% |
3 / 5 |
|
0.00% |
0 / 1 |
2.26 | |||
processExtensionModule | |
56.52% |
39 / 69 |
|
0.00% |
0 / 1 |
40.75 | |||
getExtConfig | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getContentModelHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getAnnotationStrippers | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isExtensionTag | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isAnnotationTag | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getAnnotationTags | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getExtensionTagNameMap | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getExtTagConfig | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getExtTagImpl | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
getPFragmentHandlerKeys | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getPFragmentHandlerConfig | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getPFragmentHandlerImpl | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
getExtDOMProcessors | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getAsyncFallbackMessageKey | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getWt2HtmlLimits | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getHtml2WtLimits | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
createLogger | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
12 | |||
getNoFollowConfig | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getExternalLinkTarget | n/a |
0 / 0 |
n/a |
0 / 0 |
0 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Config; |
5 | |
6 | use JsonSchema\Constraints\Constraint; |
7 | use JsonSchema\Validator; |
8 | use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface; |
9 | use Monolog\Formatter\LineFormatter; |
10 | use Monolog\Handler\ErrorLogHandler; |
11 | use Monolog\Handler\StreamHandler; |
12 | use Monolog\Logger; |
13 | use Psr\Container\ContainerInterface; |
14 | use Psr\Container\NotFoundExceptionInterface; |
15 | use Psr\Log\LoggerInterface; |
16 | use Psr\Log\LogLevel; |
17 | use Psr\Log\NullLogger; |
18 | use Wikimedia\Assert\Assert; |
19 | use Wikimedia\Bcp47Code\Bcp47Code; |
20 | use Wikimedia\ObjectFactory\ObjectFactory; |
21 | use Wikimedia\Parsoid\Core\ContentMetadataCollector; |
22 | use Wikimedia\Parsoid\Core\ContentModelHandler; |
23 | use Wikimedia\Parsoid\Core\LinkTarget; |
24 | use Wikimedia\Parsoid\DOM\Document; |
25 | use Wikimedia\Parsoid\Ext\AnnotationStripper; |
26 | use Wikimedia\Parsoid\Ext\ExtensionModule; |
27 | use Wikimedia\Parsoid\Ext\ExtensionTagHandler; |
28 | use Wikimedia\Parsoid\Ext\Gallery\Gallery; |
29 | use Wikimedia\Parsoid\Ext\Indicator\Indicator; |
30 | use Wikimedia\Parsoid\Ext\JSON\JSON; |
31 | use Wikimedia\Parsoid\Ext\Nowiki\Nowiki; |
32 | use Wikimedia\Parsoid\Ext\PFragmentHandler; |
33 | use Wikimedia\Parsoid\Ext\Pre\Pre; |
34 | use Wikimedia\Parsoid\Fragments\PFragment; |
35 | use Wikimedia\Parsoid\Utils\DOMUtils; |
36 | use Wikimedia\Parsoid\Utils\PHPUtils; |
37 | use Wikimedia\Parsoid\Utils\Utils; |
38 | use Wikimedia\Parsoid\Wikitext\Consts; |
39 | |
40 | /** |
41 | * Site-level configuration interface for Parsoid |
42 | * |
43 | * This includes both global configuration and wiki-level configuration. |
44 | */ |
45 | abstract class SiteConfig { |
46 | /** |
47 | * FIXME: not private so that ParserTests can reset these variables |
48 | * since they reuse site config and other objects between tests for |
49 | * efficiency reasons. |
50 | * |
51 | * @var array|null |
52 | */ |
53 | protected $mwAliases; |
54 | |
55 | /** @var array|null */ |
56 | private $behaviorSwitches; |
57 | |
58 | /** @var array|null */ |
59 | private $variables; |
60 | |
61 | /** @var array|null */ |
62 | private $mediaOptions; |
63 | |
64 | /** |
65 | * @var array{0:array<string,string>,1:array<string,string>} |
66 | * Localized aliases for legacy parser functions. |
67 | */ |
68 | protected array $functionSynonyms = [ [], [], ]; |
69 | |
70 | /** |
71 | * @var array{0:array<string,string>,1:array<string,string>} |
72 | * Localized aliases for parser functions defined with PFragment handlers. |
73 | */ |
74 | protected array $pFragmentHandlerFuncSynonyms = [ [], [] ]; |
75 | |
76 | /** @var string[] */ |
77 | private $protocolsRegexes = []; |
78 | |
79 | /** |
80 | * FIXME: not private so that ParserTests can reset these variables |
81 | * since they reuse site config and other objects between tests for |
82 | * efficiency reasons. |
83 | * @var array|null |
84 | */ |
85 | protected $interwikiMapNoNamespaces; |
86 | |
87 | /** |
88 | * FIXME: not private so that ParserTests can reset these variables |
89 | * since they reuse site config and other objects between tests for |
90 | * efficiency reasons. |
91 | * @var string|null|bool |
92 | */ |
93 | protected $linkTrailRegex = false; |
94 | |
95 | /** |
96 | * These extension modules provide "core" functionality |
97 | * and their implementations live in the Parsoid repo. |
98 | * |
99 | * @var class-string<ExtensionModule>[] |
100 | */ |
101 | private static $coreExtModules = [ |
102 | // content modules |
103 | JSON::class, |
104 | // extension tags |
105 | Nowiki::class, |
106 | Pre::class, |
107 | Gallery::class, |
108 | Indicator::class, |
109 | ]; |
110 | |
111 | /** |
112 | * Array mapping parsoid internal extension ID to ExtensionModule object. |
113 | * @var ?array<int,ExtensionModule> |
114 | */ |
115 | private $extModules = null; |
116 | /** |
117 | * Private counter to assign IDs to $extModules |
118 | * @var int |
119 | */ |
120 | private $extModuleNextId = 0; |
121 | |
122 | // phpcs:disable Generic.Files.LineLength.TooLong |
123 | |
124 | 	/**
125 | 	 * Register a Parsoid extension module and drop the cached extension config.
126 | 	 * @param string|array{name:string}|array{factory:callable}|array{class:class-string<ExtensionModule>} $configOrSpec
127 | 	 *  Either an object factory specification for an ExtensionModule object,
128 | 	 *  or else the configuration array which ExtensionModule::getConfig()
129 | 	 *  would return.  (The latter is preferred, but our internal extensions
130 | 	 *  use the former.)
131 | 	 * @return int An integer identifier that can be passed to
132 | 	 *  ::unregisterExtensionModule() to remove this extension module again.
133 | 	 */
134 | 	final public function registerExtensionModule( $configOrSpec ): int {
135 | 		$this->getExtensionModules(); // ensure it's initialized w/ core modules
136 | 		if ( is_string( $configOrSpec ) || isset( $configOrSpec['class'] ) || isset( $configOrSpec['factory'] ) ) {
137 | 			// Treat this as an object factory spec for an ExtensionModule
138 | 			// ObjectFactory::createObject accepts an array, not just a callable (phan bug)
139 | 			// @phan-suppress-next-line PhanTypeInvalidCallableArraySize
140 | 			$module = $this->getObjectFactory()->createObject( $configOrSpec, [
141 | 				'allowClassName' => true,
142 | 				'assertClass' => ExtensionModule::class,
143 | 			] );
144 | 		} else {
145 | 			// Treat this as a configuration array, create a new anonymous
146 | 			// ExtensionModule object for it.
147 | 			$module = new class( $configOrSpec ) implements ExtensionModule {
148 | 				private $config;
149 | 
150 | 				/** @param array $config */
151 | 				public function __construct( $config ) {
152 | 					$this->config = $config;
153 | 				}
154 | 
155 | 				/** @inheritDoc */
156 | 				public function getConfig(): array {
157 | 					return $this->config;
158 | 				}
159 | 			};
160 | 		}
161 | 		$extId = $this->extModuleNextId++;
162 | 		$this->extModules[$extId] = $module;
163 | 		// remove cached extConfig to ensure this registration is picked up
164 | 		$this->extConfig = null;
165 | 		return $extId;
166 | 	}
167 | |
168 | // phpcs:enable Generic.Files.LineLength.TooLong |
169 | |
170 | 	/**
171 | 	 * Unregister a Parsoid extension module.  This is typically used
172 | 	 * only for testing purposes in order to reset a shared SiteConfig
173 | 	 * to its original configuration.
174 | 	 * @param int $extId The value returned by the call to
175 | 	 *  ::registerExtensionModule(); ids that are not registered are ignored
176 | 	 */
177 | 	final public function unregisterExtensionModule( int $extId ): void {
178 | 		unset( $this->extModules[$extId] );
179 | 		// remove cached extConfig; this will also regenerate
180 | 		// magic word synonyms from the still-registered modules.
181 | 		$this->extConfig = null;
182 | 	}
183 | |
184 | 	/**
185 | 	 * Return the Parsoid extension modules associated with this SiteConfig,
186 | 	 * instantiating the core modules on first use.
187 | 	 *
188 | 	 * @return ExtensionModule[] Re-indexed from 0; registration ids are not preserved
189 | 	 */
190 | 	final public function getExtensionModules() {
191 | 		if ( $this->extModules === null ) {
192 | 			$this->extModules = [];
193 | 			foreach ( self::$coreExtModules as $m ) {
194 | 				$this->extModules[$this->extModuleNextId++] = new $m();
195 | 			}
196 | 		}
197 | 		return array_values( $this->extModules );
198 | 	}
199 | |
200 | /** @var LoggerInterface|null */ |
201 | protected $logger = null; |
202 | |
203 | /** @var int */ |
204 | protected $iwMatcherBatchSize = 4096; |
205 | |
206 | /** @var array|null */ |
207 | protected $iwMatcher = null; |
208 | |
209 | /** @var bool */ |
210 | protected $addHTMLTemplateParameters = false; |
211 | |
212 | /** @var bool */ |
213 | protected $scrubBidiChars = false; |
214 | |
215 | /** @var bool */ |
216 | protected $linterEnabled = false; |
217 | |
218 | /** @var ?array */ |
219 | protected $extConfig = null; |
220 | |
221 | 	/**
222 | 	 * Tag handlers for some extensions currently explicitly call unstripNowiki
223 | 	 * first thing in their handlers. They do this to strip <nowiki>..</nowiki>
224 | 	 * wrappers around args when encountered in the {{#tag:...}} parser function.
225 | 	 * However, this strategy won't work for Parsoid which calls the preprocessor
226 | 	 * to get expanded wikitext. In this mode, <nowiki> wrappers won't be stripped
227 | 	 * and this leads to functional differences in parsing and output.
228 | 	 *
229 | 	 * See T203293 and T299103 for more details.
230 | 	 *
231 | 	 * To get around this, T299103 proposes that extensions that require this support
232 | 	 * set a config flag in their Parsoid extension config. On the Parsoid end, we
233 | 	 * then let the legacy parser know of these tags. When such extension tags are
234 | 	 * encountered in the {{#tag:...}} parser function handler (see tagObj function
235 | 	 * in CoreParserFunctions.php), that handler can then automatically strip these
236 | 	 * nowiki wrappers on behalf of the extension.
237 | 	 *
238 | 	 * This serves two purposes. For one, it lets Parsoid support these extensions
239 | 	 * in this nowiki use edge case. For another, extensions that register handlers
240 | 	 * with Parsoid can get rid of explicit calls to unstripNowiki() in the
241 | 	 * tag handlers for the legacy parser.
242 | 	 *
243 | 	 * This property maintains an array of tags that need this support.
244 | 	 *
245 | 	 * @var array an associative array of tag names
246 | 	 */
247 | 	private $t299103Tags = [];
248 | |
249 | /** |
250 | * Base constructor. |
251 | * |
252 | * This constructor is public because it is used to create mock objects |
253 | * in our test suite. |
254 | */ |
255 | public function __construct() { |
256 | } |
257 | |
258 | /************************************************************************//** |
259 | * @name Global config |
260 | * @{ |
261 | */ |
262 | |
263 | 	/**
264 | 	 * General log channel; a NullLogger is created and kept when none has been set
265 | 	 * @return LoggerInterface
266 | 	 */
267 | 	public function getLogger(): LoggerInterface {
268 | 		if ( $this->logger === null ) {
269 | 			$this->logger = new NullLogger;
270 | 		}
271 | 		return $this->logger;
272 | 	}
273 | |
274 | 	/**
275 | 	 * Set the log channel, for debugging
276 | 	 * @param ?LoggerInterface $logger Passing null makes getLogger() fall back to a NullLogger
277 | 	 */
278 | 	public function setLogger( ?LoggerInterface $logger ): void {
279 | 		$this->logger = $logger;
280 | 	}
281 | |
282 | /** |
283 | * Default gallery options for this wiki. |
284 | * @return array<string,string|int|bool> |
285 | */ |
286 | public function galleryOptions(): array { |
287 | return [ |
288 | 'imagesPerRow' => 0, |
289 | 'imageWidth' => 120, |
290 | 'imageHeight' => 120, |
291 | 'captionLength' => true, |
292 | 'showBytes' => true, |
293 | 'showDimensions' => true, |
294 | 'mode' => 'traditional', |
295 | ]; |
296 | } |
297 | |
298 | /** |
299 | * When processing template parameters, parse them to HTML and add it to the |
300 | * template parameters data. |
301 | * @return bool |
302 | */ |
303 | public function addHTMLTemplateParameters(): bool { |
304 | return $this->addHTMLTemplateParameters; |
305 | } |
306 | |
307 | /** |
308 | * Statistics aggregator, for counting and timing. |
309 | * |
310 | * @return StatsdDataFactoryInterface|null |
311 | */ |
312 | public function metrics(): ?StatsdDataFactoryInterface { |
313 | return null; |
314 | } |
315 | |
316 | /** |
317 | * Increment a counter metric |
318 | * @param string $name |
319 | * @param array $labels |
320 | * @param float $amount |
321 | * @return void |
322 | */ |
323 | abstract public function incrementCounter( string $name, array $labels, float $amount = 1 ); |
324 | |
325 | /** |
326 | * Record a timing metric. |
327 | * |
328 | * Note that the value should be provided in *milliseconds* even though |
329 | * the name of the metric may end (by convention) in `_seconds`. The |
330 | * metrics infrastructure will make the appropriate conversion. |
331 | * |
332 | * @param string $name |
333 | * @param float $value A timing value *in milliseconds* |
334 | * @param array $labels |
335 | * @return void |
336 | */ |
337 | abstract public function observeTiming( string $name, float $value, array $labels ); |
338 | |
339 | /** |
340 | * If enabled, bidi chars adjacent to category links will be stripped |
341 | * in the html -> wt serialization pass. |
342 | * @return bool |
343 | */ |
344 | public function scrubBidiChars(): bool { |
345 | return $this->scrubBidiChars; |
346 | } |
347 | |
348 | /** @} */ |
349 | |
350 | /************************************************************************//** |
351 | * @name Wiki config |
352 | * @{ |
353 | */ |
354 | |
355 | /** |
356 | * Allowed external image URL prefixes. |
357 | * |
358 | * @return string[] The empty array matches no URLs. The empty string matches |
359 | * all URLs. |
360 | */ |
361 | abstract public function allowedExternalImagePrefixes(): array; |
362 | |
363 | /** |
364 | * Site base URI |
365 | * |
366 | * This would be the URI found in `<base href="..." />`. |
367 | * |
368 | * @return string |
369 | */ |
370 | abstract public function baseURI(): string; |
371 | |
372 | /** |
373 | * Prefix for relative links |
374 | * |
375 | * Prefix to prepend to a page title to link to that page. |
376 | * Intended to be relative to the URI returned by baseURI(). |
377 | * |
378 | * If possible, keep the default "./" so clients need not know this value |
379 | * to extract titles from link hrefs. |
380 | * |
381 | * @return string |
382 | */ |
383 | public function relativeLinkPrefix(): string { |
384 | return './'; |
385 | } |
386 | |
387 | 	/**
388 | 	 * Regex matching a `mw:PageProp/<behavior switch>` token bounded by string ends or whitespace
389 | 	 * @return string Memoized per distinct bswRegexp() value, so configs with different switches never share a result
390 | 	 */
391 | 	public function bswPagePropRegexp(): string {
392 | 		static $bswPagePropRegexps = []; // function statics are shared by all instances, hence the keying
393 | 		$bswRegexp = $this->bswRegexp();
394 | 		if ( !isset( $bswPagePropRegexps[$bswRegexp] ) ) {
395 | 			$bswPagePropRegexps[$bswRegexp] =
396 | 				'@(?:^|\\s)mw:PageProp/(?:' .
397 | 				PHPUtils::reStrip( $bswRegexp, '@' ) .
398 | 				')(?=$|\\s)@uDS';
399 | 		}
400 | 		return $bswPagePropRegexps[$bswRegexp];
401 | 	}
402 | |
403 | /** |
404 | * Map a canonical namespace name to its index |
405 | * |
406 | * @note This replaces canonicalNamespaces |
407 | * @param string $name all-lowercase and with underscores rather than spaces. |
408 | * @return int|null |
409 | */ |
410 | abstract public function canonicalNamespaceId( string $name ): ?int; |
411 | |
412 | /** |
413 | * Map a namespace name to its index |
414 | * |
415 | * @note This replaces canonicalNamespaces |
416 | * @param string $name all-lowercase and with underscores rather than spaces. |
417 | * @return int|null |
418 | */ |
419 | abstract public function namespaceId( string $name ): ?int; |
420 | |
421 | /** |
422 | * Map a namespace index to its preferred name |
423 | * (with spaces, not underscores). |
424 | * |
425 | * @note This replaces namespaceNames |
426 | * @param int $ns |
427 | * @return string|null |
428 | */ |
429 | abstract public function namespaceName( int $ns ): ?string; |
430 | |
431 | /** |
432 | * Test if a namespace has subpages |
433 | * |
434 | * @note This replaces namespacesWithSubpages |
435 | * @param int $ns |
436 | * @return bool |
437 | */ |
438 | abstract public function namespaceHasSubpages( int $ns ): bool; |
439 | |
440 | /** |
441 | * Return namespace case setting |
442 | * @param int $ns |
443 | * @return string 'first-letter' or 'case-sensitive' |
444 | */ |
445 | abstract public function namespaceCase( int $ns ): string; |
446 | |
447 | 	/**
448 | 	 * Test if a namespace is a talk namespace, i.e. its id is positive and odd
449 | 	 *
450 | 	 * @note This replaces title.getNamespace().isATalkNamespace()
451 | 	 * @param int $ns
452 | 	 * @return bool
453 | 	 */
454 | 	public function namespaceIsTalk( int $ns ): bool {
455 | 		return $ns > 0 && $ns % 2;
456 | 	}
457 | |
458 | 	/**
459 | 	 * Uppercase the first character of a title; a leading 'i' becomes 'İ' for az, tr, kaa and kk
460 | 	 * @param string $str
461 | 	 * @return string
462 | 	 */
463 | 	public function ucfirst( string $str ): string {
464 | 		$o = ord( $str );
465 | 		if ( $o < 96 ) { // first byte is below '`': empty string, uppercase or other non-lowercase ASCII
466 | 			return $str;
467 | 		} elseif ( $o < 128 ) {
468 | 			if ( $str[0] === 'i' &&
469 | 				in_array( $this->langBcp47()->toBcp47Code(), [ 'az', 'tr', 'kaa', 'kk' ], true )
470 | 			) {
471 | 				return 'İ' . mb_substr( $str, 1 );
472 | 			}
473 | 			return ucfirst( $str ); // use PHP's ucfirst()
474 | 		} else {
475 | 			// fall back to more complex logic in case of multibyte strings
476 | 			$char = mb_substr( $str, 0, 1 );
477 | 			return mb_strtoupper( $char ) . mb_substr( $str, 1 );
478 | 		}
479 | 	}
480 | |
481 | /** |
482 | * Get the default local name for a special page |
483 | * @param string $alias Special page alias |
484 | * @return string|null |
485 | */ |
486 | abstract public function specialPageLocalName( string $alias ): ?string; |
487 | |
488 | /** |
489 | * Treat language links as magic connectors, not inline links |
490 | * @return bool |
491 | */ |
492 | abstract public function interwikiMagic(): bool; |
493 | |
494 | 	/**
495 | 	 * Return true if the specified magic link syntax is enabled on this
496 | 	 * wiki.
497 | 	 * @param string $which One of "ISBN", "PMID", or "RFC"
498 | 	 * @return bool True if the specified magic link type is enabled on this wiki
499 | 	 */
500 | 	public function magicLinkEnabled( string $which ): bool {
501 | 		// This should be an abstract method, but in order to provide
502 | 		// graceful upgrades, start by defaulting to true for all link types
503 | 		return true;
504 | 	}
505 | |
506 | /** |
507 | * Interwiki link data. |
508 | * |
509 | * Note that the order of the keys in this array is significant: if more |
510 | * than one prefix matches a given URL during html2wt conversion, the |
511 | * *first* match is used. If you want `wikitech` to be used instead of |
512 | * `labsconsole`, for example, the `'wikitech'=>[....]` key needs to |
513 | * enumerate first. |
514 | * |
515 | * @return array<string,array> Keys are interwiki prefixes, values are arrays with the following keys: |
516 | * - prefix: (string) The interwiki prefix, same as the key. |
517 | * - url: (string) Target URL, containing a '$1' to be replaced by the interwiki target. |
518 | * - protorel: (bool, optional) Whether the url may be accessed by both http:// and https://. |
519 | * - local: (bool, optional) Whether the interwiki link is considered local (to the wikifarm). |
520 | * - localinterwiki: (bool, optional) Whether the interwiki link points to the current wiki. |
521 | * - language: (bool, optional) Whether the interwiki link is a language link. |
522 | * - extralanglink: (bool, optional) Whether the interwiki link is an "extra language link". |
523 | * - linktext: (string, optional) For "extra language links", the link text. |
524 | * (booleans marked "optional" must be omitted if false) |
525 | */ |
526 | abstract public function interwikiMap(): array; |
527 | |
528 | 	/**
529 | 	 * Interwiki link data, after removing items that conflict with namespace names.
530 | 	 * (In case of such conflict, namespace wins, interwiki is ignored.) The result is cached.
531 | 	 * @return array<string,array> See interwikiMap()
532 | 	 */
533 | 	public function interwikiMapNoNamespaces(): array {
534 | 		if ( $this->interwikiMapNoNamespaces === null ) {
535 | 			$this->interwikiMapNoNamespaces = [];
536 | 			foreach ( $this->interwikiMap() as $key => $value ) {
537 | 				if ( $this->namespaceId( (string)$key ) === null ) {
538 | 					$this->interwikiMapNoNamespaces[$key] = $value;
539 | 				}
540 | 			}
541 | 		}
542 | 		return $this->interwikiMapNoNamespaces;
543 | 	}
544 | |
545 | 	/**
546 | 	 * Match interwiki URLs; the batched matcher regexes are built on first use and cached in $iwMatcher
547 | 	 * @param string $href Link to match against
548 | 	 * @return string[]|null Two values [ string $key, string $target ] on success, null on no match.
549 | 	 */
550 | 	public function interwikiMatcher( string $href ): ?array {
551 | 		if ( $this->iwMatcher === null ) {
552 | 			$keys = [ [], [] ];
553 | 			$patterns = [ [], [] ];
554 | 			foreach ( $this->interwikiMapNoNamespaces() as $key => $iw ) {
555 | 				$key = (string)$key;
556 | 				$lang = (int)( !empty( $iw['language'] ) );
557 | 
558 | 				$url = $iw['url'];
559 | 				$protocolRelative = substr( $url, 0, 2 ) === '//';
560 | 				if ( !empty( $iw['protorel'] ) ) {
561 | 					$url = preg_replace( '/^https?:/', '', $url );
562 | 					$protocolRelative = true;
563 | 				}
564 | 
565 | 				// full-url match pattern
566 | 				$keys[$lang][] = $key;
567 | 				$patterns[$lang][] =
568 | 					// Support protocol-relative URLs
569 | 					( $protocolRelative ? '(?:https?:)?' : '' )
570 | 					// Convert placeholder to group match
571 | 					. strtr( preg_quote( $url, '/' ), [ '\\$1' => '(.*?)' ] );
572 | 
573 | 				if ( !empty( $iw['local'] ) ) {
574 | 					// ./$interwikiPrefix:$title and
575 | 					// $interwikiPrefix%3A$title shortcuts
576 | 					// are recognized and the local wiki forwards
577 | 					// these shortcuts to the remote wiki
578 | 					$quotedPrefix = preg_quote( $iw['prefix'], '/' ); // match the prefix literally, like the URL above
579 | 					$keys[$lang][] = $key;
580 | 					$patterns[$lang][] = '^\\.\\/' . $quotedPrefix . ':(.*?)';
581 | 
582 | 					$keys[$lang][] = $key;
583 | 					$patterns[$lang][] = '^' . $quotedPrefix . '%3A(.*?)';
584 | 				}
585 | 			}
586 | 
587 | 			// Prefer language matches over non-language matches
588 | 			$numLangs = count( $keys[1] );
589 | 			$keys = array_merge( $keys[1], $keys[0] );
590 | 			$patterns = array_merge( $patterns[1], $patterns[0] );
591 | 
592 | 			// Chunk patterns into reasonably sized regexes
593 | 			$this->iwMatcher = [];
594 | 			$batchStart = 0;
595 | 			$batchLen = 0;
596 | 			foreach ( $patterns as $i => $pat ) {
597 | 				$len = strlen( $pat );
598 | 				if ( $i !== $batchStart && $batchLen + $len > $this->iwMatcherBatchSize ) {
599 | 					$this->iwMatcher[] = [
600 | 						array_slice( $keys, $batchStart, $i - $batchStart ),
601 | 						'/^(?:' . implode( '|', array_slice( $patterns, $batchStart, $i - $batchStart ) ) . ')$/Di',
602 | 						$numLangs - $batchStart,
603 | 					];
604 | 					$batchStart = $i;
605 | 					$batchLen = $len;
606 | 				} else {
607 | 					$batchLen += $len;
608 | 				}
609 | 			}
610 | 			$i = count( $patterns );
611 | 			if ( $i > $batchStart ) {
612 | 				$this->iwMatcher[] = [
613 | 					array_slice( $keys, $batchStart, $i - $batchStart ),
614 | 					'/^(?:' . implode( '|', array_slice( $patterns, $batchStart, $i - $batchStart ) ) . ')$/Di',
615 | 					$numLangs - $batchStart,
616 | 				];
617 | 			}
618 | 		}
619 | 
620 | 		foreach ( $this->iwMatcher as [ $keys, $regex, $numLangs ] ) {
621 | 			if ( preg_match( $regex, $href, $m, PREG_UNMATCHED_AS_NULL ) ) {
622 | 				foreach ( $keys as $i => $key ) {
623 | 					if ( isset( $m[$i + 1] ) ) {
624 | 						if ( $i < $numLangs ) {
625 | 							// Escape language interwikis with a colon
626 | 							$key = ':' . $key;
627 | 						}
628 | 						return [ $key, $m[$i + 1] ];
629 | 					}
630 | 				}
631 | 			}
632 | 		}
633 | 		return null;
634 | 	}
635 | |
636 | /** |
637 | * Wiki identifier, for cache keys. |
638 | * Should match a key in mwApiMap()? |
639 | * @return string |
640 | */ |
641 | abstract public function iwp(): string; |
642 | |
643 | /** |
644 | * Legal title characters |
645 | * |
646 | * Regex is intended to match bytes, not Unicode characters. |
647 | * |
648 | * @return string Regex character class (i.e. the bit that goes inside `[]`) |
649 | */ |
650 | abstract public function legalTitleChars(): string; |
651 | |
652 | /** |
653 | * Link prefix regular expression. |
654 | * @return string|null |
655 | */ |
656 | abstract public function linkPrefixRegex(): ?string; |
657 | |
658 | /** |
659 | * Return raw link trail regexp from config |
660 | * @return string |
661 | */ |
662 | abstract protected function linkTrail(): string; |
663 | |
664 | 	/**
665 | 	 * Link trail regular expression, derived from linkTrail() on first use (false = not yet computed).
666 | 	 * @return string|null Null when the configured trail contains an empty group "()"
667 | 	 */
668 | 	public function linkTrailRegex(): ?string {
669 | 		if ( $this->linkTrailRegex === false ) {
670 | 			$trail = $this->linkTrail();
671 | 			$trail = str_replace( '(.*)$', '', $trail );
672 | 			if ( strpos( $trail, '()' ) !== false ) {
673 | 				// Empty regex from zh-hans
674 | 				$this->linkTrailRegex = null;
675 | 			} else {
676 | 				$this->linkTrailRegex = $trail;
677 | 			}
678 | 		}
679 | 		return $this->linkTrailRegex;
680 | 	}
681 | |
682 | /** |
683 | * Wiki language code. |
684 | * @return Bcp47Code BCP-47 language code |
685 | */ |
686 | abstract public function langBcp47(): Bcp47Code; |
687 | |
688 | /** |
689 | * Main page title, as LinkTarget |
690 | * @return LinkTarget |
691 | */ |
692 | abstract public function mainPageLinkTarget(): LinkTarget; |
693 | |
694 | /** |
695 | * Lookup config |
696 | * @param string $key |
697 | * @return mixed|null config value for $key, if present or null, if not. |
698 | * @deprecated This very broad interface is no longer needed. |
699 | */ |
700 | abstract public function getMWConfigValue( string $key ); |
701 | |
702 | /** |
703 | * Whether the wiki language is right-to-left |
704 | * @return bool |
705 | */ |
706 | abstract public function rtl(): bool; |
707 | |
708 | /** |
709 | * Whether language converter is enabled for the specified language |
710 | * @param Bcp47Code $lang |
711 | * @return bool |
712 | */ |
713 | abstract public function langConverterEnabledBcp47( Bcp47Code $lang ): bool; |
714 | |
715 | /** |
716 | * The URL path to index.php. |
717 | * @return string |
718 | */ |
719 | abstract public function script(): string; |
720 | |
721 | /** |
722 | * FIXME: This is only used to compute the modules path below |
723 | * and maybe shouldn't be exposed. |
724 | * |
725 | * The base wiki path |
726 | * @return string |
727 | */ |
728 | abstract public function scriptpath(): string; |
729 | |
730 | /** |
731 | * The base URL of the server. |
732 | * @return string |
733 | */ |
734 | abstract public function server(): string; |
735 | |
736 | /** |
737 | * Export content metadata via meta tags (and via a stylesheet |
738 | * for now to aid some clients). |
739 | * |
740 | * @param Document $document |
741 | * @param ContentMetadataCollector $metadata |
742 | * @param string $defaultTitle The default title to display, as an |
743 | * unescaped string |
744 | * @param Bcp47Code $lang a BCP-47 language code |
745 | */ |
746 | abstract public function exportMetadataToHeadBcp47( |
747 | Document $document, |
748 | ContentMetadataCollector $metadata, |
749 | string $defaultTitle, |
750 | Bcp47Code $lang |
751 | ): void; |
752 | |
/**
 * Helper function to create <head> elements from metadata.
 *
 * Appends, in this order: a `mw:jsConfigVars` meta tag (only when the
 * config vars are non-empty and could be JSON-encoded), a
 * `mw:generalModules` meta tag, a `mw:moduleStyles` meta tag, and a
 * stylesheet <link> built from $modulesLoadURI.
 *
 * @param Document $document
 * @param string $modulesLoadURI
 * @param string[] $modules
 * @param string[] $moduleStyles
 * @param array<string,mixed> $jsConfigVars
 * @param string $htmlTitle The display title, as escaped HTML
 *  (deliberately unused, see the comment in the body)
 * @param Bcp47Code $lang a Bcp47Code object
 */
protected function exportMetadataHelper(
	Document $document,
	string $modulesLoadURI,
	array $modules,
	array $moduleStyles,
	array $jsConfigVars,
	string $htmlTitle,
	Bcp47Code $lang
): void {
	// $htmlTitle contains the DISPLAYTITLE but it corresponds to the
	// value of the ParserOutput *not* the ultimate value which would
	// be used in the <h1> tag *nor* the plaintext value which would
	// be used for the page <title>.  OutputPage does additional
	// validation/stripping on the displaytitle value before using it.
	// As such we're going to just ignore $htmlTitle for now rather
	// than report an incorrect value in the <head> (T324431).

	// Small local helper: append one <meta property=... content=...>.
	$appendMeta = static function ( string $property, string $value ) use ( $document ): void {
		DOMUtils::appendToHead( $document, 'meta', [
			'property' => $property,
			'content' => $value,
		] );
	};

	// JsConfigVars
	$encodedVars = null;
	if ( $jsConfigVars ) {
		try {
			$encodedVars = PHPUtils::jsonEncode( $jsConfigVars );
		} catch ( \Exception $e ) {
			// Similar to ResourceLoader::makeConfigSetScript.  See T289358
			$this->getLogger()->log(
				LogLevel::WARNING,
				'JSON serialization of config data failed. ' .
					'This usually means the config data is not valid UTF-8.'
			);
		}
	}
	if ( $encodedVars ) {
		$appendMeta( 'mw:jsConfigVars', $encodedVars );
	}

	// Modules returned from preprocessor / parse requests
	if ( $modules ) {
		// mw:generalModules can be processed via JS (and async) and are
		// usually (but not always) JS scripts.
		$appendMeta( 'mw:generalModules', implode( '|', array_unique( $modules ) ) );
	}

	// Styles from modules returned from preprocessor / parse requests
	if ( $moduleStyles ) {
		// mw:moduleStyles are CSS modules that are render-blocking.
		$appendMeta( 'mw:moduleStyles', implode( '|', array_unique( $moduleStyles ) ) );
	}

	/*
	 * While unnecessary for Wikimedia clients, a stylesheet url in
	 * the <head> is useful for clients like Kiwix and others who
	 * might not want to process the meta tags to construct the
	 * resourceloader url.
	 *
	 * Given that these clients will be consuming Parsoid HTML outside
	 * a MediaWiki skin, the clients are effectively responsible for
	 * their own "skin". But, once again, as a courtesy, we are
	 * hardcoding the vector skin modules for them. But, note that
	 * this may cause page elements to render differently than how
	 * they render on Wikimedia sites with the vector skin since this
	 * is probably missing a number of other modules.
	 *
	 * All that said, note that JS-generated parts of the page will
	 * still require them to have more intimate knowledge of how to
	 * process the JS modules. Except for <graph>s, page content
	 * doesn't require JS modules at this point. So, where these
	 * clients want to invest in the necessary logic to construct a
	 * better resourceloader url, they could simply delete / ignore
	 * this stylesheet.
	 */
	$courtesyStyles = [
		'mediawiki.skinning.content.parsoid',
		// Use the base styles that API output and fallback skin use.
		'mediawiki.skinning.interface',
		// Make sure to include contents of user generated styles
		// e.g. MediaWiki:Common.css / MediaWiki:Mobile.css
		'site.styles'
	];
	$allStyles = array_unique( array_merge( $moduleStyles, $courtesyStyles ) );
	$encodedStyles = PHPUtils::encodeURIComponent( implode( '|', $allStyles ) );

	# need to use MW-internal language code for constructing resource
	# loader path.
	$mwLangCode = Utils::bcp47ToMwCode( $lang );
	$stylesheetHref = $modulesLoadURI . '?lang=' . $mwLangCode . '&modules=' .
		$encodedStyles .
		'&only=styles&skin=vector';
	DOMUtils::appendToHead( $document, 'link', [ 'rel' => 'stylesheet', 'href' => $stylesheetHref ] );
}
855 | |
856 | /** |
857 | * A regexp matching the localized 'REDIRECT' marker for this wiki. |
858 | * The regexp should be delimited, but should not have boundary anchors |
859 | * or capture groups. |
860 | * @return string |
861 | */ |
862 | abstract public function redirectRegexp(): string; |
863 | |
864 | /** |
865 | * A regexp matching the localized 'Category' prefix for this wiki. |
866 | * The regexp should be delimited, but should not have boundary anchors |
867 | * or capture groups. |
868 | * @return string |
869 | */ |
870 | abstract public function categoryRegexp(): string; |
871 | |
872 | /** |
873 | * A regexp matching localized behavior switches for this wiki. |
874 | * The regexp should be delimited, but should not have boundary anchors |
875 | * or capture groups. |
876 | * @return string |
877 | */ |
878 | abstract public function bswRegexp(): string; |
879 | |
/**
 * A regex matching a line containing just whitespace, comments, and
 * sol transparent links and behavior switches.
 *
 * The pattern is anchored at both ends and allows one optional leading
 * redirect followed by any number of category links, behavior
 * switches, comments and whitespace characters.
 *
 * @return string A regexp delimited with `@`
 */
public function solTransparentWikitextRegexp(): string {
	// cscott sadly says: Note that this depends on the precise
	// localization of the magic words of this particular wiki.
	// NOTE(review): the cache below is a function-level static, so it is
	// shared by every instance of this class in the process.
	static $cachedRegexp = null;
	if ( $cachedRegexp !== null ) {
		return $cachedRegexp;
	}

	$redirect = PHPUtils::reStrip( $this->redirectRegexp(), '@' );
	$category = PHPUtils::reStrip( $this->categoryRegexp(), '@' );
	$bswRegexp = PHPUtils::reStrip( $this->bswRegexp(), '@' );
	$comment = PHPUtils::reStrip( Utils::COMMENT_REGEXP, '@' );

	$ws = '[ \t\n\r\0\x0b]';
	// Optional redirect marker followed by its link target
	$optionalRedirect = '(?:' .
		'(?:' . $redirect . ')' .
		'[ \t\n\r\x0c]*(?::[ \t\n\r\x0c]*)?\[\[[^\]]+\]\]' .
		')?';
	// Everything else that is transparent at start-of-line
	$transparentItems = implode( '|', [
		'\[\[' . $category . '\:[^\]]*?\]\]',
		'__(?:' . $bswRegexp . ')__',
		$comment,
		$ws,
	] );

	$cachedRegexp = '@^' . $ws . '*' . $optionalRedirect . '(?:' . $transparentItems . ')*$@';
	return $cachedRegexp;
}
909 | |
/**
 * A regex matching a line containing just comments and
 * sol transparent links and behavior switches.
 *
 * Unlike solTransparentWikitextRegexp(), the pattern is unanchored, does
 * not accept bare whitespace as an item, and wraps the whole match in a
 * capture group.
 *
 * @param bool $addIncludes When true, `<includeonly>...</includeonly>`
 *  blocks are accepted as transparent items as well.
 * @return string A regexp delimited with `@`
 */
public function solTransparentWikitextNoWsRegexp(
	bool $addIncludes = false
): string {
	// cscott sadly says: Note that this depends on the precise
	// localization of the magic words of this particular wiki.
	//
	// The cache is keyed by $addIncludes: the two variants are different
	// patterns, and a single shared slot would hand whichever variant was
	// built first to every later caller regardless of the flag.
	// NOTE(review): still a function-level static, hence shared by every
	// instance of this class in the process.
	static $cache = [];
	$cacheKey = (int)$addIncludes;
	if ( !isset( $cache[$cacheKey] ) ) {
		$redirect = PHPUtils::reStrip( $this->redirectRegexp(), '@' );
		$category = PHPUtils::reStrip( $this->categoryRegexp(), '@' );
		$bswRegexp = PHPUtils::reStrip( $this->bswRegexp(), '@' );
		$comment = PHPUtils::reStrip( Utils::COMMENT_REGEXP, '@' );
		$cache[$cacheKey] = '@' .
			'((?:' .
				'(?:' . $redirect . ')' .
				'[ \t\n\r\x0c]*(?::[ \t\n\r\x0c]*)?\[\[[^\]]+\]\]' .
			')?' .
			'(?:' .
				'\[\[' . $category . '\:[^\]]*?\]\]|' .
				'__(?:' . $bswRegexp . ')__|' .
				$comment .
				// FIXME(SSS): What about onlyinclude and noinclude?
				( $addIncludes ? '|<includeonly>[\S\s]*?</includeonly>' : '' ) .
			')*)@';
	}
	return $cache[$cacheKey];
}
943 | |
944 | /** |
945 | * The wiki's time zone offset |
946 | * @return int Minutes east of UTC |
947 | */ |
948 | abstract public function timezoneOffset(): int; |
949 | |
950 | /** |
951 | * Language variant information for the given language (or null if |
952 | * unknown). |
953 | * @param Bcp47Code $lang The language for which you want variant information |
954 | * @return ?array{base:Bcp47Code,fallbacks:Bcp47Code[]} an array with |
955 | * two fields: |
956 | * - base: (Bcp47Code) Base BCP-47 language code (e.g. "zh") |
957 | * - fallbacks: (Bcp47Code[]) Fallback variants, as BCP-47 codes |
958 | */ |
959 | abstract public function variantsFor( Bcp47Code $lang ): ?array; |
960 | |
961 | /** |
962 | * Default thumbnail width |
963 | */ |
964 | abstract public function widthOption(): int; |
965 | |
/**
 * Magic word IDs that should be treated as variables.
 *
 * populateMagicWords() turns this list into a lookup set and records
 * every alias of a listed magic word in $this->variables.
 *
 * @return array presumably a list of magic word ID strings -- TODO confirm
 */
abstract protected function getVariableIDs(): array;
967 | |
/**
 * Raw magic word definitions for this wiki.
 *
 * As consumed by populateMagicWords(): keys are magic word IDs, and each
 * value is an array whose first element is the case-sensitivity flag
 * (truthy means case-sensitive) and whose remaining elements are the
 * aliases of that magic word.
 *
 * @return array
 */
abstract protected function getMagicWords(): array;
969 | |
/**
 * Whether this SiteConfig supplies precomputed function synonyms via
 * getFunctionSynonyms().
 *
 * A subclass returning false here is expected to provide its own
 * updateFunctionSynonym() implementation, which populateMagicWords()
 * then calls for every magic word alias.
 *
 * @return bool True in this default implementation
 */
protected function haveComputedFunctionSynonyms(): bool {
	return true;
}
978 | |
/**
 * Get a list of precomputed synonyms for parser functions registered
 * with the legacy parser.  Be aware that this is distinct from the
 * set of parser functions with Parsoid-native implementations!
 *
 * Lookups elsewhere in this class use index 0 with a lower-cased name
 * and index 1 with the name exactly as written.  This default
 * implementation returns two empty maps.
 *
 * @return array{0:array<string,string>,1:array<string,string>}
 */
protected function getFunctionSynonyms(): array {
	$caseInsensitive = [];
	$caseSensitive = [];
	return [ $caseInsensitive, $caseSensitive ];
}
988 | |
/**
 * If ::haveComputedFunctionSynonyms() returns false, this function is
 * called once on every magic word alias.  This function is responsible
 * for determining if the magic word key ($magicword) corresponds to a
 * registered legacy parser function (list obtained via
 * `action=query&meta=siteinfo&siprop=functionhooks`) and setting
 * `$this->functionSynonyms[$case][$alias] = $magicword` if so.
 *
 * This default implementation always throws: it must be overridden by
 * any subclass whose haveComputedFunctionSynonyms() returns false.
 *
 * @param string $func A localized alias for this magic word
 * @param string $magicword The lookup key for this magic word
 * @param bool $caseSensitive If true, $func is to be treated as
 *   case-sensitive.
 * @throws \RuntimeException Always, in this default implementation
 */
protected function updateFunctionSynonym( string $func, string $magicword, bool $caseSensitive ): void {
	throw new \RuntimeException( "Unexpected code path!" );
}
1005 | |
/**
 * Reset our cached magic word lookup tables.
 *
 * Clearing $this->mwAliases makes the next populateMagicWords() call
 * rebuild all of the derived tables.  This function is intended to be
 * used by parser tests to re-compute magic words, behavior switches,
 * lists of magic variables, etc after processing test-specific settings.
 * @internal
 */
public function resetMagicWords() {
	// populateMagicWords() treats an empty alias table as "not built yet".
	$this->mwAliases = null;
}
1017 | |
/**
 * Build the magic word lookup tables (aliases, behavior switches,
 * variables, media options and -- when not precomputed -- function
 * synonyms) from getMagicWords().  Does nothing if the alias table is
 * already non-empty.
 */
private function populateMagicWords() {
	if ( !empty( $this->mwAliases ) ) {
		// Already built; resetMagicWords() clears this to force a rebuild.
		return;
	}

	$this->mediaOptions = [];
	$this->variables = [];
	$this->behaviorSwitches = [];
	$this->mwAliases = [];

	$variableIds = PHPUtils::makeSet( $this->getVariableIDs() );
	$this->functionSynonyms = $this->getFunctionSynonyms();
	$synonymsPrecomputed = $this->haveComputedFunctionSynonyms();

	foreach ( $this->getMagicWords() as $magicword => $definition ) {
		// First element is the case-sensitivity flag, the rest are aliases.
		$caseSensitive = array_shift( $definition );
		$isVariable = isset( $variableIds[$magicword] );
		$isMediaOption = preg_match( '/^(img|timedmedia)_/', $magicword );

		foreach ( $definition as $alias ) {
			$this->mwAliases[$magicword][] = $alias;
			if ( !$caseSensitive ) {
				// T389029: strtolower is not the same as case-folding
				$alias = mb_strtolower( $alias );
				$this->mwAliases[$magicword][] = $alias;
			}
			// From here on $alias is the lower-cased form for
			// case-insensitive magic words.
			if ( strncmp( $alias, '__', 2 ) === 0 ) {
				$this->behaviorSwitches[$alias] = [ $caseSensitive, $magicword ];
			}
			if ( $isVariable ) {
				$this->variables[$alias] = $magicword;
			}
			if ( $isMediaOption ) {
				$this->mediaOptions[$alias] = [ $caseSensitive, $magicword ];
			}
			if ( !$synonymsPrecomputed ) {
				$this->updateFunctionSynonym( $alias, $magicword, (bool)$caseSensitive );
			}
		}
	}
}
1053 | |
/**
 * List all magic words by canonical name.
 *
 * Builds the lookup tables on first use.
 *
 * @return string[][] Keys are canonical names, values are arrays of aliases.
 */
public function mwAliases(): array {
	// Make sure the table exists before handing it out.
	$this->populateMagicWords();
	return $this->mwAliases;
}
1062 | |
/**
 * Return canonical magic word for a parser function
 * @param string $str A localized potential parser function name, including
 *   any leading `#` (but not a trailing colon or bar)
 * @return array{key:?string,isNative:bool}
 *   The magic word "key" for this parser function and a boolean
 *   indicating whether this is a parsoid-native PFragment handler
 *   (true) or a parser function handled by the legacy parser
 *   fallback (false).  The key is `null` if no parser function
 *   matching $str is known (in which case isNative is false).
 */
public function getMagicWordForParserFunction( string $str ): array {
	# Case insensitive functions:
	# Core uses $parser->contLang->lc($str) which is optimized but
	# equivalent to mb_strtolower; case-insensitivity for parser
	# function names should be deprecated, though, and converting
	# to lower case doesn't actually yield a case-insensitive match
	# (T389029)
	$lower = mb_strtolower( $str );

	# Native implementations take precedence.  In both tables the
	# exact-case map (index 1) is consulted before the lower-cased
	# map (index 0).
	$this->getExtConfig();
	$nativeKey = $this->pFragmentHandlerFuncSynonyms[1][$str] ??
		$this->pFragmentHandlerFuncSynonyms[0][$lower] ?? null;
	if ( $nativeKey !== null ) {
		return [ 'key' => $nativeKey, 'isNative' => true ];
	}

	# Legacy parser functions
	$this->populateMagicWords();
	$legacyKey = $this->functionSynonyms[1][$str] ??
		$this->functionSynonyms[0][$lower] ?? null;
	return [ 'key' => $legacyKey, 'isNative' => false ];
}
1097 | |
/**
 * Return canonical magic word for a variable
 * @param string $str Alias to look up, used as-is as the lookup key
 * @return string|null The magic word ID, or null if $str is not a
 *   known variable alias
 */
public function getMagicWordForVariable( string $str ): ?string {
	$this->populateMagicWords();
	$variables = $this->variables;
	return $variables[$str] ?? null;
}
1107 | |
/**
 * Look up the canonical magic word name for $word in a table of
 * `alias => [ caseSensitive, canonicalName ]` entries.
 *
 * An exact key match always wins.  Otherwise the lower-cased word is
 * tried, and accepted only if that entry is flagged case-insensitive.
 *
 * @param array $mws Lookup table as described above
 * @param string $word
 * @return string|null
 */
private static function getMagicWordCanonicalName( array $mws, string $word ): ?string {
	$exact = $mws[$word] ?? null;
	if ( $exact !== null ) {
		return $exact[1];
	}
	$folded = $mws[mb_strtolower( $word )] ?? null;
	if ( !$folded || $folded[0] ) {
		// Unknown, or known only as a case-sensitive alias.
		return null;
	}
	return $folded[1];
}
1115 | |
/**
 * Return canonical magic word for a media option
 * @param string $word Alias to look up
 * @return string|null The canonical name, or null if $word is not a
 *   known media option alias
 */
public function getMagicWordForMediaOption( string $word ): ?string {
	$this->populateMagicWords();
	$table = $this->mediaOptions;
	return self::getMagicWordCanonicalName( $table, $word );
}
1125 | |
/**
 * Return canonical magic word for a behavior switch
 * @param string $word Alias to look up
 * @return string|null The canonical name, or null if $word is not a
 *   known behavior switch alias
 */
public function getMagicWordForBehaviorSwitch( string $word ): ?string {
	$this->populateMagicWords();
	$table = $this->behaviorSwitches;
	return self::getMagicWordCanonicalName( $table, $word );
}
1135 | |
/**
 * Check if a string is a recognized behavior switch, i.e. whether
 * getMagicWordForBehaviorSwitch() resolves it to a canonical name.
 *
 * @param string $word
 * @return bool
 */
public function isBehaviorSwitch( string $word ): bool {
	$canonicalName = $this->getMagicWordForBehaviorSwitch( $word );
	return $canonicalName !== null;
}
1145 | |
/**
 * Convert the internal canonical magic word name to the wikitext alias.
 *
 * If the magic word has no known aliases, $suggest is returned
 * unchanged.  If $suggest is itself one of the aliases it is returned;
 * otherwise the first alias is used.
 *
 * @param string $word Canonical magic word name
 * @param string $suggest Suggested alias (used as fallback and preferred choice)
 * @return string
 */
public function getMagicWordWT( string $word, string $suggest ): string {
	$candidates = $this->mwAliases()[$word] ?? null;
	if ( !$candidates ) {
		return $suggest;
	}
	// array_search() yields false when the suggestion is not an alias;
	// the `?: 0` below then selects the first alias.
	$position = $suggest ? array_search( $suggest, $candidates, true ) : 0;
	return $candidates[$position ?: 0];
}
1163 | |
1164 | /** |
1165 | * Get a regexp matching a localized magic word, given its id. |
1166 | * |
1167 | * FIXME: misleading function name |
1168 | * |
1169 | * @param string $id |
1170 | * @return string |
1171 | */ |
1172 | abstract public function getMagicWordMatcher( string $id ): string; |
1173 | |
1174 | /** |
1175 | * Get a matcher function for fetching values out of interpolated magic words, |
1176 | * ie those with `$1` in their aliases. |
1177 | * |
1178 | * The matcher takes a string and returns null if it doesn't match any of |
1179 | * the words, or an associative array if it did match: |
1180 | * - k: The magic word that matched |
1181 | * - v: The value of $1 that was matched |
1182 | * (the JS also returned 'a' with the specific alias that matched, but that |
1183 | * seems to be unused and so is omitted here) |
1184 | * |
1185 | * @param string[] $words Magic words to match |
1186 | * @return callable |
1187 | */ |
1188 | abstract protected function getParameterizedAliasMatcher( array $words ): callable; |
1189 | |
/**
 * Get a matcher function for fetching values out of interpolated magic words
 * which are media prefix options.
 *
 * This is getParameterizedAliasMatcher() applied to the keys of
 * Consts::$Media['PrefixOptions'].
 *
 * The matcher takes a string and returns null if it doesn't match any of
 * the words, or an associative array if it did match:
 * - k: The magic word that matched
 * - v: The value of $1 that was matched
 * (the JS also returned 'a' with the specific alias that matched, but that
 * seems to be unused and so is omitted here)
 *
 * @return callable
 */
final public function getMediaPrefixParameterizedAliasMatcher(): callable {
	// PORT-FIXME: this shouldn't be a constant, we should fetch these
	// from the SiteConfig.  Further, we probably need a hook here so
	// Parsoid can handle media options defined in extensions... in
	// particular timedmedia_* magic words from Extension:TimedMediaHandler
	return $this->getParameterizedAliasMatcher(
		array_keys( Consts::$Media['PrefixOptions'] )
	);
}
1211 | |
1212 | /** |
1213 | * Get the maximum template depth |
1214 | * |
1215 | * @return int |
1216 | */ |
1217 | abstract public function getMaxTemplateDepth(): int; |
1218 | |
1219 | /** |
1220 | * Return namespace aliases for the NS_SPECIAL namespace |
1221 | * @return array |
1222 | */ |
1223 | abstract protected function getSpecialNSAliases(): array; |
1224 | |
1225 | /** |
1226 | * Return Special Page aliases for a special page name |
1227 | * @param string $specialPage |
1228 | * @return array |
1229 | */ |
1230 | abstract protected function getSpecialPageAliases( string $specialPage ): array; |
1231 | |
/**
 * Quote a title for use inside a regex.
 *
 * The string is escaped with preg_quote() for the given delimiter, then
 * every space or underscore is replaced with `[ _]` so either will be
 * matched.
 *
 * @param string $s
 * @param string $delimiter Defaults to '/'
 * @return string
 */
protected static function quoteTitleRe( string $s, string $delimiter = '/' ): string {
	// strtr() works in a single pass, so the characters inside the
	// inserted `[ _]` are not themselves replaced again.
	return strtr( preg_quote( $s, $delimiter ), [
		' ' => '[ _]',
		'_' => '[ _]',
	] );
}
1250 | |
/**
 * Matcher for ISBN/RFC/PMID URL patterns, returning the type and number.
 *
 * The match method takes a string and returns false on no match or a tuple
 * like this on match: [ 'RFC', '12345' ]
 *
 * Patterns for magic link types that are disabled (per
 * magicLinkEnabled()) are left out of the matcher.
 *
 * @return callable
 */
public function getExtResourceURLPatternMatcher(): callable {
	// NOTE(review): the namespace aliases are inserted into the pattern
	// as-is, so getSpecialNSAliases() is assumed to return regex-safe
	// fragments -- confirm against the implementations.
	$nsAliases = implode( '|', array_unique( $this->getSpecialNSAliases() ) );
	// The combined pattern below is delimited with `!`, so the page
	// aliases must be quoted for that delimiter (quoteTitleRe() would
	// otherwise default to `/`).
	$pageAliases = implode( '|', array_map(
		function ( string $alias ): string {
			return static::quoteTitleRe( $alias, '!' );
		},
		$this->getSpecialPageAliases( 'Booksources' )
	) );

	$pats = [
		'ISBN' => '(?:\.\.?/)*(?i:' . $nsAliases . ')(?:%3[Aa]|:)'
			. '(?i:' . $pageAliases . ')(?:%2[Ff]|/)(?P<ISBN>\d+[Xx]?)',
		// Recently the target url for RFCs changed from
		// tools.ietf.org to datatracker.ietf.org/docs.
		// Given edit stash storage on Wikimedia wikis, we need to retain the
		// old mapping to ensure html->wt can handle that HTML properly
		// But, 3rd party wikis with Parsoid HTML in their caches will also
		// need this b/c support for much longer. Once the MW LTS release with
		// tools.ietf.org EOLs, we can remove the tools.ietf.org string here.
		// T382963 tracks the eventual removal of this b/c.
		'RFC' => '[^/]*//(?:datatracker\.ietf\.org/doc|tools\.ietf\.org)/html/rfc(?P<RFC>\w+)',
		'PMID' => '[^/]*//www\.ncbi\.nlm\.nih\.gov/pubmed/(?P<PMID>\w+)\?dopt=Abstract',
	];
	// T145590: remove patterns for disabled magic links
	foreach ( array_keys( $pats ) as $v ) {
		if ( !$this->magicLinkEnabled( $v ) ) {
			unset( $pats[$v] );
		}
	}
	$regex = '!^(?:' . implode( '|', $pats ) . ')$!';
	return static function ( $text ) use ( $pats, $regex ) {
		if ( preg_match( $regex, $text, $m ) ) {
			// Exactly one alternative matched; it is the one whose named
			// group captured a non-empty value.
			foreach ( $pats as $k => $re ) {
				if ( isset( $m[$k] ) && $m[$k] !== '' ) {
					return [ $k, $m[$k] ];
				}
			}
		}
		return false;
	};
}
1297 | |
/**
 * Whether the linter is enabled, as recorded in $this->linterEnabled.
 *
 * @return bool
 */
public function linterEnabled(): bool {
	return $this->linterEnabled;
}
1304 | |
/**
 * Return the desired linter configuration.  These are heuristic values
 * which have hardcoded defaults but could be overridden on a per-wiki
 * basis.
 * @return array{enabled?:?string[],disabled?:?string[],maxTableColumnHeuristic?:int,maxTableRowsToCheck?:int,tidyWhitespaceBugMaxLength?:int}
 */
public function getLinterSiteConfig(): array {
	$config = [];

	// Allow list for specific lint types.
	// Takes precedence over block list.
	$config['enabled'] = null;

	// Block list for specific lint types.
	// Not used if an allow list is set.
	$config['disabled'] = null;

	// The maximum columns in a table before the table is considered
	// large
	$config['maxTableColumnHeuristic'] = 5;

	// The maximum rows (header or data) to be checked for the large
	// table lint
	// - If we consider the first N rows to be representative of the
	//   table, and the table is well-formed and uniform, it is
	//   sufficient to check the first N rows to check if the table is
	//   "large".
	// - This heuristic is used together with the
	//   'maxTableColumnHeuristic' to identify "large tables".
	$config['maxTableRowsToCheck'] = 10;

	// Max length of content covered by 'white-space:nowrap' CSS
	// that we consider "safe" when Tidy is replaced. Beyond that,
	// wikitext will have to be fixed up to manually insert whitespace
	// at the right places. Length in bytes.
	$config['tidyWhitespaceBugMaxLength'] = 100;

	return $config;
}
1338 | |
/**
 * Serialize ISBN/RFC/PMID URL patterns
 *
 * If the link text, once whitespace-normalized, is exactly the magic
 * link form of $match, the text alone is returned (it is already a magic
 * link).  Otherwise an explicit wikilink (ISBN) or external link
 * (RFC/PMID) to $href is produced.
 *
 * @param string[] $match As returned by the getExtResourceURLPatternMatcher() matcher
 * @param string $href Fallback link target, if $match is invalid.
 * @param string $content Link text
 * @return string
 * @throws \InvalidArgumentException If $match[0] is not ISBN, RFC or PMID
 */
public function makeExtResourceURL( array $match, string $href, string $content ): string {
	// Collapse runs of (Unicode) space characters into a single space.
	$text = preg_replace(
		'/[ \x{00A0}\x{1680}\x{2000}-\x{200A}\x{202F}\x{205F}\x{3000}]+/u', ' ',
		Utils::decodeWtEntities( $content )
	);

	// TODO: T145590 ("Update Parsoid to be compatible with magic links being disabled")
	switch ( $match[0] ) {
		case 'ISBN':
			$compact = strtoupper( preg_replace( '/[\- \t]/', '', $text ) );
			// validate ISBN length and format, so as not to produce magic links
			// which aren't actually magic
			$wellFormed = preg_match( '/^ISBN(97[89])?\d{9}(\d|X)$/D', $compact );
			if ( $wellFormed && implode( '', $match ) === $compact ) {
				return $content;
			}
			// strip "./" prefix. TODO: Use relativeLinkPrefix() instead?
			$target = PHPUtils::stripPrefix( $href, './' );
			return "[[$target|$content]]";

		case 'RFC':
		case 'PMID':
			$compact = preg_replace( '/[ \t]/', '', $text );
			if ( implode( '', $match ) !== $compact ) {
				return "[$href $content]";
			}
			return $content;

		default:
			throw new \InvalidArgumentException( "Invalid match type '{$match[0]}'" );
	}
}
1376 | |
1377 | /** |
1378 | * Get the list of valid protocols |
1379 | * @return array |
1380 | */ |
1381 | abstract protected function getProtocols(): array; |
1382 | |
/**
 * Get a regex fragment matching URL protocols, quoted for an exclamation
 * mark delimiter.  The case-insensitive option should be used.
 *
 * The fragment is an alternation of the values from getProtocols() and
 * is memoized separately for each value of $excludeProtRel.
 *
 * @param bool $excludeProtRel Whether to exclude protocol-relative URLs
 * @return string
 */
public function getProtocolsRegex( bool $excludeProtRel = false ) {
	$cacheKey = (int)$excludeProtRel;
	if ( isset( $this->protocolsRegexes[$cacheKey] ) ) {
		return $this->protocolsRegexes[$cacheKey];
	}

	$quoted = [];
	foreach ( $this->getProtocols() as $protocol ) {
		if ( $cacheKey && $protocol === '//' ) {
			// Protocol-relative prefix was asked to be left out.
			continue;
		}
		$quoted[] = preg_quote( $protocol, '!' );
	}
	$this->protocolsRegexes[$cacheKey] = implode( '|', $quoted );
	return $this->protocolsRegexes[$cacheKey];
}
1403 | |
/**
 * Matcher for valid protocols, must be anchored at start of string.
 * Matching is case-insensitive.
 * @param string $potentialLink
 * @return bool Whether $potentialLink begins with a valid protocol
 */
public function hasValidProtocol( string $potentialLink ): bool {
	$protocols = $this->getProtocolsRegex();
	$anchored = '!^(?:' . $protocols . ')!i';
	return preg_match( $anchored, $potentialLink ) === 1;
}
1413 | |
/**
 * Matcher for valid protocols, may occur at any point within string.
 * The protocol must sit at the start of the string or directly after a
 * non-word character.  Matching is case-insensitive.
 * @param string $potentialLink
 * @return bool Whether $potentialLink contains a valid protocol
 */
public function findValidProtocol( string $potentialLink ): bool {
	$protocols = $this->getProtocolsRegex();
	$unanchored = '!(?:\W|^)(?:' . $protocols . ')!i';
	return preg_match( $unanchored, $potentialLink ) === 1;
}
1423 | |
1424 | /** @} */ |
1425 | |
/**
 * Fake timestamp, for unit tests.
 *
 * This default implementation never fakes the time.
 *
 * @return int|null Unix timestamp, or null to not fake it
 */
public function fakeTimestamp(): ?int {
	// No fake time by default.
	return null;
}
1433 | |
1434 | /** |
1435 | * Get an array of defined extension tags, with the lower case name in the |
1436 | * key, the value arbitrary. This is the set of extension tags that are |
1437 | * configured in M/W core. $coreExtModules may already be part of it, |
1438 | * but eventually this distinction will disappear since all extension tags |
1439 | * have to be defined against the Parsoid's extension API. |
1440 | * |
1441 | * @return array |
1442 | */ |
1443 | abstract protected function getNonNativeExtensionTags(): array; |
1444 | |
/**
 * Return an object factory to use when instantiating extensions.
 * (This is assumed to be plumbed up to an appropriate service container.)
 *
 * The default implementation wraps a container that holds no services:
 * has() is always false and get() always throws.
 *
 * @return ObjectFactory The object factory to use for extensions
 */
public function getObjectFactory(): ObjectFactory {
	// Service container with nothing in it.
	$emptyContainer = new class() implements ContainerInterface {

		/**
		 * Always fails, since no service is registered.
		 * @param string $id
		 * @return never
		 */
		public function get( $id ) {
			throw new class( "Empty service container" ) extends \Error
				implements NotFoundExceptionInterface {
			};
		}

		/**
		 * No service is ever available.
		 * @param string $id
		 * @return false
		 */
		public function has( $id ): bool {
			return false;
		}
	};

	return new ObjectFactory( $emptyContainer );
}
1474 | |
/**
 * Whether to validate extension module's configuration arrays
 * against the schema.  Subclasses should return true when running
 * tests, but may elect to return false in production.
 *
 * @return bool True in this default implementation
 */
protected function shouldValidateExtConfig(): bool {
	// Validate unless a subclass opts out.
	return true;
}
1484 | |
/**
 * (Re)build $this->extConfig from the registered extension modules.
 *
 * Starts from an empty configuration, seeds the tag list with the
 * non-native extension tags, resets the PFragment handler synonyms and
 * then feeds every module from getExtensionModules() through
 * processExtensionModule().
 *
 * FIXME: might benefit from T250230 (caching) but see T270307 --
 * currently SiteConfig::unregisterExtensionModule() is called
 * during testing, which requires invalidating $this->extConfig.
 * (See also SiteConfig::fakeTimestamp() etc.)  We'd probably need
 * to more fully separate/mock the "testing SiteConfig" as well
 * as provide a way for parser options to en/disable individual
 * registered modules before this class can be considered immutable
 * and cached.
 */
private function constructExtConfig() {
	$emptyConfig = [
		'allTags' => [],
		'parsoidExtTags' => [],
		'annotationTags' => [],
		'domProcessors' => [],
		'annotationStrippers' => [],
		'contentModels' => [],
		'pFragmentHandlers' => [],
	];
	$this->extConfig = $emptyConfig;

	// There may be some tags defined by the parent wiki which have no
	// associated parsoid modules; for now we handle these by invoking
	// the legacy parser.
	$this->extConfig['allTags'] = $this->getNonNativeExtensionTags();

	// Reset the list of PFragment handler synonyms; they will be recreated
	// as we process the extension modules.
	$this->pFragmentHandlerFuncSynonyms = [ [], [], ];

	foreach ( $this->getExtensionModules() as $extensionModule ) {
		$this->processExtensionModule( $extensionModule );
	}
}
1519 | |
/**
 * Report whether the given tag has an entry in the table of tags that
 * asked for nowiki stripping (the 'stripNowiki' tag option).
 *
 * @param string $lowerTagName Tag name; the lookup is done on the name exactly as given
 * @return bool
 */
public function tagNeedsNowikiStrippedInTagPF( string $lowerTagName ): bool {
	$entry = $this->t299103Tags[$lowerTagName] ?? null;
	return $entry !== null;
}
1527 | |
/**
 * Return the JSON Schema for Extension Modules.
 *
 * The schema file is read and decoded on first use and then kept in a
 * function-level static for the rest of the process.
 *
 * @return object The decoded schema
 * @throws \RuntimeException If the schema file cannot be read
 * @throws \JsonException If the schema file does not contain valid JSON
 * @throws \UnexpectedValueException If the JSON document is not an object
 */
private static function getExtensionModuleSchema(): object {
	static $schema = null;
	if ( $schema === null ) {
		$schemaPath = __DIR__ . '/../Ext/moduleconfig.schema.json';
		$json = file_get_contents( $schemaPath );
		if ( $json === false ) {
			// Fail with a message that names the file, rather than letting
			// `false` flow into json_decode() and surface as a TypeError
			// on the return type.
			throw new \RuntimeException(
				"Unable to read extension module schema: " . $schemaPath
			);
		}
		$decoded = json_decode( $json, false, 512, JSON_THROW_ON_ERROR );
		if ( !is_object( $decoded ) ) {
			throw new \UnexpectedValueException(
				"Extension module schema is not a JSON object: " . $schemaPath
			);
		}
		// Only memoize once we know the value is usable.
		$schema = $decoded;
	}
	return $schema;
}
1539 | |
/**
 * Merge one Parsoid-compatible extension module into $this->extConfig.
 *
 * In order, this records the module's extension tags, its annotation
 * tags and annotation stripper, its PFragment handlers (including the
 * parser function synonyms derived from magic word aliases), its DOM
 * processors, its content model handlers, and its PFragment types.
 *
 * @param ExtensionModule $ext
 */
protected function processExtensionModule( ExtensionModule $ext ): void {
	Assert::invariant( $this->extConfig !== null, "not yet inited!" );
	$moduleConfig = $ext->getConfig();
	Assert::invariant(
		isset( $moduleConfig['name'] ),
		"Every extension module must have a name."
	);
	if ( $this->shouldValidateExtConfig() ) {
		$schemaValidator = new Validator();
		$schemaValidator->validate(
			$moduleConfig,
			self::getExtensionModuleSchema(),
			Constraint::CHECK_MODE_TYPE_CAST // allow associative arrays
		);
		Assert::invariant(
			$schemaValidator->isValid(),
			"Found errors when validating " .
				$moduleConfig['name'] . " ExtensionModule config: " .
				json_encode( $schemaValidator->getErrors(), JSON_PRETTY_PRINT )
		);
	}
	$moduleName = $moduleConfig['name'];

	// Extension tag handlers: these provide wt2html (sourceToDom),
	// html2wt (domToWikitext), and linter functionality.
	foreach ( $moduleConfig['tags'] ?? [] as $tagSpec ) {
		$tagKey = mb_strtolower( $tagSpec['name'] );
		$this->extConfig['allTags'][$tagKey] = true;
		$this->extConfig['parsoidExtTags'][$tagKey] = $tagSpec;
		// b/c nowiki stripping support needed by some extensions:
		// this registers the tag with the legacy parser for implicit
		// nowiki stripping in {{#tag:..}} args for this tag.
		if ( isset( $tagSpec['options']['stripNowiki'] ) ) {
			$this->t299103Tags[$tagKey] = true;
		}
	}

	$annotationSpec = $moduleConfig['annotations'] ?? null;
	if ( $annotationSpec !== null ) {
		foreach ( $annotationSpec['tagNames'] ?? [] as $annotationTag ) {
			$tagKey = mb_strtolower( $annotationTag );
			$this->extConfig['allTags'][$tagKey] = true;
			$this->extConfig['annotationTags'][$tagKey] = true;
		}
		$stripperSpec = $annotationSpec['annotationStripper'] ?? null;
		if ( $stripperSpec !== null ) {
			// One stripper per module, stored under the module's name.
			$this->extConfig['annotationStrippers'][$moduleName] =
				$this->getObjectFactory()->createObject( $stripperSpec, [
					'allowClassName' => true,
					'assertClass' => AnnotationStripper::class,
				] );
		}
	}

	$this->populateMagicWords();
	$knownMagicWords = $this->getMagicWords();
	// PFragment handlers are named using magic words
	$fragmentHandlers = $moduleConfig['pFragmentHandlers'] ?? $moduleConfig['fragmentHandlers'] ?? [];
	foreach ( $fragmentHandlers as $handlerSpec ) {
		// The key is a magic word; entries without one are ignored.
		$key = $handlerSpec['key'] ?? null;
		if ( !$key ) {
			continue;
		}
		// Carry along which extension and parsoid module defined
		// this fragment handler.
		$handlerSpec['module-name'] = $moduleConfig['name'];
		$handlerSpec['extension-name'] = $moduleConfig['extension-name'] ?? null;
		$this->extConfig['pFragmentHandlers'][$key] = $handlerSpec;
		if ( !array_key_exists( $key, $knownMagicWords ) ) {
			continue;
		}
		// Case-insensitive is deprecated! T389029
		$caseIndex = $knownMagicWords[$key][0] ?? 0;
		$isParserFunction = isset( $handlerSpec['options']['parserFunction'] );
		// 'hash' is the default; for legacy compatibility a few parser
		// functions are defined without a hash or have the hash already
		// prepended to the magic word alias.
		$omitHash = isset( $handlerSpec['options']['nohash'] );
		foreach ( $this->mwAliases[$key] as $alias ) {
			// TODO (T390342): ['options']['extensionTag'] can also be set,
			// and we would register this PFragment handler as a
			// localizable (!) extension tag.
			// $this->pFragmentHandlerTagSynonyms[$case][$alias]=$key;
			if ( !$isParserFunction ) {
				continue;
			}
			$synonym = $omitHash ? $alias : '#' . $alias;
			$this->pFragmentHandlerFuncSynonyms[$caseIndex][$synonym] = $key;
		}
	}

	// DOM processors supply the wt2htmlPostProcessor and
	// html2wtPreProcessor functionality; they are stored per module.
	if ( isset( $moduleConfig['domProcessors'] ) ) {
		$this->extConfig['domProcessors'][$moduleName] = $moduleConfig['domProcessors'];
	}

	foreach ( $moduleConfig['contentModels'] ?? [] as $model => $handlerSpec ) {
		// For compatibility with mediawiki core, the first registered
		// extension wins: never replace an existing handler.
		if ( !isset( $this->extConfig['contentModels'][$model] ) ) {
			$this->extConfig['contentModels'][$model] =
				$this->getObjectFactory()->createObject( $handlerSpec, [
					'allowClassName' => true,
					'assertClass' => ContentModelHandler::class,
				] );
		}
	}

	// Extension modules can register new PFragment types
	foreach ( $moduleConfig['PFragmentTypes'] ?? [] as $fragmentClass ) {
		PFragment::registerFragmentClass( $fragmentClass );
	}
}
1660 | |
/**
 * Return the combined extension configuration, building it on first
 * use (or whenever $this->extConfig is currently empty).
 *
 * @return array
 */
protected function getExtConfig(): array {
	if ( $this->extConfig ) {
		return $this->extConfig;
	}
	$this->constructExtConfig();
	return $this->extConfig;
}
1667 | |
/**
 * Look up the ContentModelHandler registered for $contentmodel.
 *
 * A null result means nothing is registered for that model; the caller
 * is then expected to use the default wikitext content model handler.
 *
 * @param string $contentmodel
 * @return ContentModelHandler|null
 */
public function getContentModelHandler( string $contentmodel ): ?ContentModelHandler {
	$extConfig = $this->getExtConfig();
	return $extConfig['contentModels'][$contentmodel] ?? null;
}
1678 | |
/**
 * Return every AnnotationStripper registered through an annotation
 * configuration, as a plain list.
 *
 * @return array<AnnotationStripper>
 */
public function getAnnotationStrippers(): array {
	$strippers = $this->getExtConfig()['annotationStrippers'] ?? [];
	// Sort by key first so the resulting list order is stable.
	ksort( $strippers );
	return array_values( $strippers );
}
1689 | |
/**
 * Check whether $name is a valid extension tag name. The caller must
 * already have converted $name to lower case.
 *
 * @param string $name
 * @return bool
 */
public function isExtensionTag( string $name ): bool {
	$tagMap = $this->getExtensionTagNameMap();
	return isset( $tagMap[$name] );
}
1700 | |
/**
 * Check whether $tagName is an annotation tag. The name is lower-cased
 * before the lookup.
 *
 * @param string $tagName
 * @return bool
 */
public function isAnnotationTag( string $tagName ): bool {
	$extConfig = $this->getExtConfig();
	$tagKey = mb_strtolower( $tagName );
	return $extConfig['annotationTags'][$tagKey] ?? false;
}
1708 | |
/**
 * List the names of all defined annotation tags, in lower case.
 *
 * @return array
 */
public function getAnnotationTags(): array {
	return array_keys( $this->getExtConfig()['annotationTags'] );
}
1717 | |
/**
 * Return the set of defined extension tags as a map: each key is a
 * lower-case tag name, and the associated value carries no meaning.
 *
 * @return array<string,true>
 */
public function getExtensionTagNameMap(): array {
	return $this->getExtConfig()['allTags'];
}
1728 | |
/**
 * Fetch the tag configuration a Parsoid extension module supplied for
 * the given extension tag. The name is lower-cased before the lookup.
 *
 * @param string $tagName Extension tag name
 * @return array|null Null when no module configured this tag
 */
public function getExtTagConfig( string $tagName ): ?array {
	$extConfig = $this->getExtConfig();
	$tagKey = mb_strtolower( $tagName );
	return $extConfig['parsoidExtTags'][$tagKey] ?? null;
}
1737 | |
/**
 * Results memoized by getExtTagImpl(), keyed by lower-cased tag name.
 * A null value records that the tag's configuration had no 'handler'.
 * @var array<string,?ExtensionTagHandler>
 */
private array $tagHandlerCache = [];
/**
 * Results memoized by getPFragmentHandlerImpl(), keyed by the handler's
 * magic word ID. A null value records that the handler configuration
 * had no 'handler'.
 * @var array<string,?PFragmentHandler>
 */
private array $pFragmentHandlerCache = [];
1742 | |
/**
 * Return the implementation of the named extension tag, if there is
 * one. The result (including a null result) is memoized per
 * lower-cased tag name.
 *
 * @param string $tagName Extension tag name
 * @return ExtensionTagHandler|null
 */
public function getExtTagImpl( string $tagName ): ?ExtensionTagHandler {
	$tagKey = mb_strtolower( $tagName );
	if ( array_key_exists( $tagKey, $this->tagHandlerCache ) ) {
		return $this->tagHandlerCache[$tagKey];
	}

	$tagConfig = $this->getExtTagConfig( $tagKey );
	$impl = null;
	if ( isset( $tagConfig['handler'] ) ) {
		$impl = $this->getObjectFactory()->createObject( $tagConfig['handler'], [
			'allowClassName' => true,
			'assertClass' => ExtensionTagHandler::class,
		] );
	}
	$this->tagHandlerCache[$tagKey] = $impl;

	return $this->tagHandlerCache[$tagKey];
}
1761 | |
/**
 * List the magic word IDs under which PFragment handlers have been
 * registered with Parsoid.
 *
 * @return list<string>
 */
public function getPFragmentHandlerKeys() {
	$handlers = $this->getExtConfig()['pFragmentHandlers'] ?? [];
	return array_keys( $handlers );
}
1770 | |
/**
 * Fetch the stored configuration of one PFragment handler.
 *
 * @param string $key Magic word ID naming this PFragment handler
 * @return array{handler?:string|array}|null The handler configuration,
 *   including a 'handler' property holding an object factory
 *   specification for a PFragmentHandler; null if $key is not registered.
 */
public function getPFragmentHandlerConfig( string $key ) {
	return $this->getExtConfig()['pFragmentHandlers'][$key] ?? null;
}
1781 | |
/**
 * Return the PFragmentHandler instance for the given key, creating it
 * from the configured object factory spec on first request. The result
 * (including a null result) is memoized per key.
 *
 * @param string $key Magic word ID naming this PFragment handler
 *
 * @return ?PFragmentHandler
 */
public function getPFragmentHandlerImpl( string $key ): ?PFragmentHandler {
	if ( array_key_exists( $key, $this->pFragmentHandlerCache ) ) {
		return $this->pFragmentHandlerCache[$key];
	}

	$handlerConfig = $this->getPFragmentHandlerConfig( $key );
	$impl = null;
	if ( isset( $handlerConfig['handler'] ) ) {
		$impl = $this->getObjectFactory()->createObject( $handlerConfig['handler'], [
			'allowClassName' => true,
			'assertClass' => PFragmentHandler::class,
		] );
	}
	$this->pFragmentHandlerCache[$key] = $impl;

	return $this->pFragmentHandlerCache[$key];
}
1799 | |
/**
 * Return the registered DOM processors: a map from extension name to
 * that extension's array of object factory specs for Ext\DOMProcessor
 * objects.
 *
 * @return array<string,list<string|array>>
 */
public function getExtDOMProcessors(): array {
	return $this->getExtConfig()['domProcessors'];
}
1809 | |
/**
 * Name the localization message to be used for asynchronous fallback
 * content.
 *
 * @return string Message key
 */
public function getAsyncFallbackMessageKey(): string {
	return 'parsoid-async-not-ready-fallback';
}
1817 | |
/**
 * Limits handed out by getWt2HtmlLimits().
 * @var array<string,int>
 */
protected $wt2htmlLimits = [
	// Pages whose wikitext exceeds this size are not handled
	// (ParserOptions::maxIncludeSize)
	'wikitextSize' => 2048 * 1024,

	// Maximum number of list items on a page
	'listItem' => 30_000,

	// Maximum number of table cells on a page
	'tableCell' => 30_000,

	// Maximum number of transclusions on a page
	'transclusion' => 10_000,

	// Maximum number of images on a page.
	// NOTE(review): this limit has been labelled "DISABLED for now",
	// yet the entry is still present in this array -- confirm whether
	// consumers actually enforce it.
	'image' => 1_000,

	// Maximum size of a top-level token (1M)
	'token' => 1_000_000,
];
1839 | |
/**
 * Return the contents of $this->wt2htmlLimits, keyed by limit name.
 *
 * @return array<string,int>
 */
public function getWt2HtmlLimits(): array {
	return $this->wt2htmlLimits;
}
1846 | |
/**
 * Limits handed out by getHtml2WtLimits().
 * @var array<string,int>
 */
protected $html2wtLimits = [
	// HTML strings larger than this are refused for serialization (10M)
	'htmlSize' => 10_000_000,
];
1852 | |
/**
 * Return the contents of $this->html2wtLimits, keyed by limit name.
 *
 * @return array<string,int>
 */
public function getHtml2WtLimits(): array {
	return $this->html2wtLimits;
}
1859 | |
/**
 * Build a Monolog logger named "Parsoid CLI" that emits bare messages.
 *
 * With a file path, records go to that file through a StreamHandler
 * (one message per line) and a separator record is logged immediately.
 * Without one, records go through an ErrorLogHandler.
 *
 * @param ?string $filePath File to log to (if null, logs to console)
 * @return Logger
 */
public static function createLogger( ?string $filePath = null ): Logger {
	$logger = new Logger( "Parsoid CLI" );

	if ( $filePath ) {
		$handler = new StreamHandler( $filePath );
		$format = '%message%' . "\n";
	} else {
		// Monolog's PHP console handler
		$handler = new ErrorLogHandler();
		$format = '%message%';
	}

	// The third argument keeps newlines inside a message intact.
	$formatter = new LineFormatter( $format, null, true );
	$handler->setFormatter( $formatter );
	$logger->pushHandler( $handler );

	if ( $filePath ) {
		// StreamHandler appends, so mark where this run's output begins.
		$logger->log( Logger::INFO, "-------------- starting fresh log --------------" );
	}

	return $logger;
}
1885 | |
/**
 * Must be implemented by concrete subclasses.
 *
 * NOTE(review): presumably this is the configuration governing
 * rel="nofollow" handling; the array's keys and value types are not
 * visible from this declaration -- confirm against the implementations.
 *
 * @return array
 */
abstract public function getNoFollowConfig(): array;
1887 | |
/**
 * Must be implemented by concrete subclasses.
 *
 * NOTE(review): presumably the target to use for external links, with
 * false meaning that none is configured -- confirm against the
 * implementations.
 *
 * @return string|false
 */
abstract public function getExternalLinkTarget();
1890 | } |