Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
13.47% |
33 / 245 |
|
20.69% |
6 / 29 |
CRAP | |
0.00% |
0 / 1 |
Utils | |
13.47% |
33 / 245 |
|
20.69% |
6 / 29 |
4019.84 | |
0.00% |
0 / 1 |
stripParsoidIdPrefix | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripNamespace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isParsoidObjectId | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isVoidElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
recursiveClone | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
clone | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
56 | |||
lastUniChar | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
isUniWord | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
phpURLEncode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeURI | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
decodeURIComponent | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
extractExtBody | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isValidOffset | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isValidDSR | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 | |||
normalizeNamespaceName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeWtEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
escapeWtEntities | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
escapeWt | |
0.00% |
0 / 53 |
|
0.00% |
0 / 1 |
42 | |||
escapeHtml | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
entityEncodeAll | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
isProtocolValid | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
getExtArgInfo | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
parseMediaDimensions | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
90 | |||
validateMediaParam | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
getStar | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
isLinkTrail | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
bcp47ToMwCode | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
6 | |||
mwCodeToBcp47 | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
182 | |||
isBcp47CodeEqual | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Psr\Log\LoggerInterface; |
7 | use Wikimedia\Bcp47Code\Bcp47Code; |
8 | use Wikimedia\Bcp47Code\Bcp47CodeValue; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Config\SiteConfig; |
11 | use Wikimedia\Parsoid\Core\DomSourceRange; |
12 | use Wikimedia\Parsoid\Core\Sanitizer; |
13 | use Wikimedia\Parsoid\NodeData\DataMw; |
14 | use Wikimedia\Parsoid\Tokens\Token; |
15 | use Wikimedia\Parsoid\Wikitext\Consts; |
16 | |
17 | /** |
18 | * This file contains general utilities for token transforms. |
19 | */ |
20 | class Utils { |
21 | /** |
22 | * Regular expression fragment for matching wikitext comments. |
23 | * Meant for inclusion in other regular expressions. |
24 | */ |
25 | // Maintenance note: this is used in /x regexes so all whitespace and # should be escaped |
26 | public const COMMENT_REGEXP_FRAGMENT = '<!--(?>[\s\S]*?-->)'; |
27 | /** Regular expression for matching a wikitext comment */ |
28 | public const COMMENT_REGEXP = '/' . self::COMMENT_REGEXP_FRAGMENT . '/'; |
29 | |
/**
 * Remove the leading Parsoid id prefix from an about ID.
 *
 * A leading "mwt", optionally preceded by "#", is dropped; any other
 * input is returned as-is.
 *
 * @param string $aboutId about ID string
 * @return string
 */
public static function stripParsoidIdPrefix( string $aboutId ): string {
	// 'mwt' is the prefix used for new ids
	$prefixPattern = '/^#?mwt/';
	$stripped = preg_replace( $prefixPattern, '', $aboutId );
	return $stripped;
}
40 | |
/**
 * Reduce a fully qualified PHP class name to its unqualified name by
 * removing everything up to and including the last backslash.
 *
 * @param string $className
 * @return string
 */
public static function stripNamespace( string $className ): string {
	// Greedy match: consumes through the final namespace separator.
	$namespacePattern = '/.*\\\\/';
	$shortName = preg_replace( $namespacePattern, '', $className );
	return $shortName;
}
49 | |
/**
 * Tell whether an about ID carries the Parsoid id prefix.
 *
 * @param string $aboutId about ID string
 * @return bool true when $aboutId begins with "#mwt"
 */
public static function isParsoidObjectId( string $aboutId ): bool {
	// 'mwt' is the prefix used for new ids
	$prefix = '#mwt';
	return strncmp( $aboutId, $prefix, strlen( $prefix ) ) === 0;
}
60 | |
/**
 * Determine if the named tag is void (can not have content).
 *
 * The name is looked up as a key of Consts::$HTML['VoidTags'].
 *
 * @param string $name tag name
 * @return bool
 */
public static function isVoidElement( string $name ): bool {
	$voidTags = Consts::$HTML['VoidTags'] ?? [];
	return isset( $voidTags[$name] );
}
70 | |
/**
 * Helper that always requests a deep clone of its argument.
 *
 * @param object $el object
 * @return object
 */
private static function recursiveClone( $el ) {
	$deepClone = true;
	return self::clone( $el, $deepClone );
}
80 | |
/**
 * Clone a value; deep by default.
 *
 * @param object|array $obj arrays or plain objects.
 *  Tokens or DOM nodes shouldn't be passed in.
 *
 *  CAVEAT: debugging callers appear to pass arrays that can contain
 *  DOM nodes. So, when $debug is set, a top-level DOM node or DOM nodes
 *  embedded in arrays are handled via cloneNode(). This still fails
 *  if an object embeds a DOM node.
 *
 * @param bool $deepClone false requests a shallow copy
 * @param bool $debug enable the DOM-node / array handling described above
 * @return object|array
 */
public static function clone( $obj, $deepClone = true, $debug = false ) {
	if ( $debug && $obj instanceof \DOMNode ) {
		return $obj->cloneNode( $deepClone );
	}

	if ( $debug && is_array( $obj ) ) {
		if ( !$deepClone ) {
			// Copy-on-write cloning
			return $obj;
		}
		// Clone every element, staying in debug mode so nested DOM nodes work.
		return array_map(
			static fn ( $item ) => self::clone( $item, true, true ),
			$obj
		);
	}

	if ( is_object( $obj ) && !$deepClone ) {
		return clone $obj;
	}

	// FIXME, see T161647
	// This will fail if $obj is (or embeds) a DOMNode
	$snapshot = serialize( $obj );
	return unserialize( $snapshot );
}
122 | |
/**
 * Extract the last *unicode* character of the string.
 * This might be more than one byte, if the last character
 * is non-ASCII.
 *
 * The scan walks backwards over UTF-8 continuation bytes (10xxxxxx)
 * and never moves before the start of the string: for malformed input
 * made only of continuation bytes, the bytes collected up to offset 0
 * are returned.
 *
 * @param string $str
 * @param ?int $idx The index *after* the character to extract; defaults
 *  to the length of $str, which will extract the last character in
 *  $str.
 * @return string The extracted character, or '' when $str is empty or
 *  $idx is outside the range 1..strlen( $str ).
 */
public static function lastUniChar( string $str, ?int $idx = null ): string {
	$len = strlen( $str );
	$idx ??= $len;
	// Also covers the empty string with the default index (0 <= 0).
	if ( $idx <= 0 || $idx > $len ) {
		return '';
	}
	$c = $str[--$idx];
	// ord() inspects the first byte of $c, i.e. the byte most recently
	// prepended. The $idx > 0 bound stops the scan at the start of the
	// string instead of wrapping into negative offsets.
	while ( $idx > 0 && ( ord( $c ) & 0xC0 ) === 0x80 ) {
		$c = $str[--$idx] . $c;
	}
	return $c;
}
145 | |
/**
 * Test whether $s starts with a unicode word character.
 *
 * @param string $s
 * @return bool
 */
public static function isUniWord( string $s ): bool {
	// preg_match() yields 1 on a match, 0 on no match and false on error;
	// only an actual match counts.
	$matchCount = preg_match( '#^\w#u', $s );
	return $matchCount === 1;
}
154 | |
/**
 * Do not call this: it unconditionally throws and exists only to
 * point callers at urlencode().
 *
 * @param string $txt URL to encode using PHP encoding
 * @return string
 */
public static function phpURLEncode( $txt ) {
	// @phan-suppress-previous-line PhanPluginNeverReturnMethod
	$reason = 'Use urlencode( $txt ) instead';
	throw new \BadMethodCallException( $reason );
}
164 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * Unlike `decodeURIComponent`, escapes for the URI reserved characters
 * are kept encoded, matching the behavior of JavaScript's decodeURI().
 *
 * @see https://www.ecma-international.org/ecma-262/6.0/#sec-decodeuri-encodeduri
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURI( string $s ): string {
	// Protect the reserved-character escapes by encoding their '%' as
	// '%25'; decodeURIComponent then turns '%25XX' back into '%XX'.
	$reservedEscape = '/%(?=2[346bcfBCF]|3[abdfABDF]|40)/';
	$shielded = preg_replace( $reservedEscape, '%25', $s );
	return self::decodeURIComponent( $shielded );
}
180 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURIComponent( string $s ): string {
	// Fast path: usually the whole input decodes to valid UTF-8.
	$decoded = rawurldecode( $s );
	if ( mb_check_encoding( $decoded, 'UTF-8' ) ) {
		return $decoded;
	}

	// Slow path: match each escape run shaped like a 1- to 4-byte UTF-8
	// sequence and decode it individually.
	// phpcs:ignore Generic.Files.LineLength.TooLong
	$utf8Escapes = '/%[0-7][0-9A-F]|%[CD][0-9A-F]%[89AB][0-9A-F]|%E[0-9A-F](?:%[89AB][0-9A-F]){2}|%F[0-4](?:%[89AB][0-9A-F]){3}/i';
	$decodeIfValid = static function ( $m ) {
		$bytes = rawurldecode( $m[0] );
		// Keep the original escapes when the bytes are not valid UTF-8.
		return mb_check_encoding( $bytes, 'UTF-8' ) ? $bytes : $m[0];
	};
	return preg_replace_callback( $utf8Escapes, $decodeIfValid, $s );
}
204 | |
/**
 * Extract extension source from the token.
 *
 * Reads the token's 'source' attribute and passes it to stripTags() on
 * the token's extTagOffsets (presumably removing the opening and closing
 * extension tags -- confirm against DomSourceRange::stripTags).
 *
 * @param Token $token token
 * @return string
 */
public static function extractExtBody( Token $token ): string {
	$source = $token->getAttributeV( 'source' );
	$extTagOffsets = $token->dataParsoid->extTagOffsets;
	'@phan-var \Wikimedia\Parsoid\Core\DomSourceRange $extTagOffsets';
	$body = $extTagOffsets->stripTags( $source );
	return $body;
}
217 | |
/**
 * Helper that checks a single offset value.
 *
 * @param ?int $n value to check
 * @return bool true when $n is non-null and zero or positive
 */
private static function isValidOffset( ?int $n ): bool {
	if ( $n === null ) {
		return false;
	}
	return $n >= 0;
}
227 | |
/**
 * Basic check if a DOM Source Range (DSR) is valid.
 *
 * Clarifications about the "basic validity checks":
 * - Only checks for underflow, not for overflow.
 * - Does not verify that start <= end
 * - Does not verify that openWidth + endWidth <= end - start
 *   (even so, the values might be invalid because of content)
 * These would be overkill for our purposes. Given how DSR computation
 * works in this codebase, the real scenarios we care about are
 * non-null / non-negative values since that can happen.
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool
 */
public static function isValidDSR(
	?DomSourceRange $dsr, bool $all = false
): bool {
	if ( $dsr === null ) {
		return false;
	}
	// start and end must always be usable offsets.
	if ( !self::isValidOffset( $dsr->start ) || !self::isValidOffset( $dsr->end ) ) {
		return false;
	}
	if ( !$all ) {
		return true;
	}
	// With $all set, the tag widths must be valid too.
	return self::isValidOffset( $dsr->openWidth ) &&
		self::isValidOffset( $dsr->closeWidth );
}
256 | |
/**
 * Canonicalizes a namespace name: lower-cases it and replaces every
 * space with an underscore.
 *
 * @param string $name Non-normalized namespace name.
 * @return string
 */
public static function normalizeNamespaceName( string $name ): string {
	$lowered = mb_strtolower( $name );
	return str_replace( ' ', '_', $lowered );
}
266 | |
267 | /** |
268 | * Decode HTML5 entities in wikitext. |
269 | * |
270 | * NOTE that wikitext only allows semicolon-terminated entities, while |
271 | * HTML allows a number of "legacy" entities to be decoded without |
272 | * a terminating semicolon. This function deliberately does not |
273 | * decode these HTML-only entity forms. |
274 | * |
275 | * @param string $text |
276 | * @return string |
277 | */ |
278 | public static function decodeWtEntities( string $text ): string { |
279 | // Note that HTML5 allows semicolon-less entities which |
280 | // wikitext does not: in wikitext all entities must end in a |
281 | // semicolon. |
282 | // By normalizing before decoding, this routine deliberately |
283 | // does not decode entity references which are invalid in wikitext |
284 | // (mostly because they decode to invalid codepoints). |