Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 78 |
|
0.00% |
0 / 20 |
CRAP | |
0.00% |
0 / 1 |
RemexCollectionMunger | |
0.00% |
0 / 78 |
|
0.00% |
0 / 20 |
1560 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
startCollectionSection | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
startDocument | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
endDocument | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
characters | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
insertElement | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
endTag | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
doctype | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
comment | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
error | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
mergeAttributes | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
removeNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
reparentChildren | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getPosition | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
fixHeading | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
42 | |||
numberHeading | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
fixId | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
90 | |||
getUnreservedId | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
isHeading | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
startsWith | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\Collection; |
4 | |
5 | use Wikimedia\RemexHtml\Serializer\Serializer; |
6 | use Wikimedia\RemexHtml\Tokenizer\Attributes; |
7 | use Wikimedia\RemexHtml\TreeBuilder\Element; |
8 | use Wikimedia\RemexHtml\TreeBuilder\TreeHandler; |
9 | |
/**
 * DOM tree munger for RemexHtml that makes small adjustments to a HTML document for including
 * in a collection (a HTML document that's more or less the concatenation of multiple original
 * documents).
 *
 * The munger sits between the RemexHtml tree builder and a Serializer: every TreeHandler event
 * is forwarded to the serializer, with elements adjusted on the way through.
 *
 * The munger is reused for parsing multiple documents and outputs a single unified document.
 * It makes small changes to make the resulting document valid and look good:
 * - converts h1 to h2 while preserving heading structure
 * - removes the document name from before self-references
 * - renames conflicting ids
 * - optionally adds numbers before the sections
 */
class RemexCollectionMunger implements TreeHandler {

	/**
	 * Munger options, merged with defaults in the constructor.
	 * Keys used by this class: 'topHeadingLevel', 'numberSections'.
	 * @var array
	 */
	private $options;

	/**
	 * Map from original document ID to collection document ID. Possible values:
	 * - true: the ID was first seen in the current source document and is used unchanged;
	 * - false: the ID is reserved (it was seen while processing an earlier source document, or
	 *   it was handed out as a replacement), so encountering it requires mapping it to a free ID;
	 * - string: the replacement ID chosen for this ID within the current source document.
	 * @var array
	 */
	private $idMap = [];

	/**
	 * Reference to section data. id and level will be updated to keep in sync with document changes.
	 * @var array[] [[ title => ..., id => ..., level => ... ], ...]
	 */
	private $sectionRef;

	/**
	 * 1-based index for the current source document in the list of source documents.
	 * Starts at 0 and is incremented by startCollectionSection().
	 * @var int
	 */
	private $documentIndex = 0;

	/**
	 * URL for the current document, relative to its base URL. For a Parsoid document this will
	 * be something like './Title'.
	 * @var string
	 */
	private $selfLink;

	/**
	 * Tracks how many levels headings need to be moved. E.g. a document with h1;h2;h3
	 * needs to be transformed to h2;h3;h4 while a document with h2;h3;h1 to h2;h3;h2
	 * so we set $headingDisplacementLevel when encountering h1 and use it to decide what to
	 * do with other headings.
	 * @var int
	 */
	private $headingDisplacementLevel = 0;

	/**
	 * @var HeadingCounter A counter for section numbers.
	 */
	private $sectionCounter;

	/**
	 * Source document end position, as reported to endDocument().
	 * @var int
	 */
	private $endPos;

	/** @var Serializer */
	private $serializer;

	/**
	 * @param Serializer $serializer Receives all (munged) tree events.
	 * @param array $options
	 *   - topHeadingLevel: highest allowed heading level (e.g. '2' means h1 is disallowed and will
	 *     be "pushed down"). Defaults to 2.
	 *   - numberSections: whether to add a data-mw-sectionnumber attribute to headings.
	 *     Defaults to true.
	 */
	public function __construct( Serializer $serializer, $options = [] ) {
		$this->serializer = $serializer;
		// Array union: caller-supplied keys win, missing keys fall back to the defaults.
		$this->options = $options + [
			'topHeadingLevel' => 2,
			'numberSections' => true,
		];
	}

	/**
	 * Reset internal state. Needs to be called before parsing a new source document.
	 * @param string $selfLink URL prefix before # which means this is a local URL
	 * @param array[] &$sections Section data; each section is a triple
	 *   [ title => ..., id => ..., level => ... ]. RemexCollectionMunger will update the id/level
	 *   to keep in sync with document changes.
	 * @param HeadingCounter $sectionCounter
	 */
	public function startCollectionSection( $selfLink, &$sections, HeadingCounter $sectionCounter ) {
		$this->documentIndex++;
		$this->headingDisplacementLevel = 0;
		// set all mappings to false: they are only valid within a single source document
		$this->idMap = array_fill_keys( array_keys( $this->idMap ), false );
		$this->sectionRef = &$sections;
		$this->selfLink = $selfLink;
		$this->sectionCounter = $sectionCounter;
	}

	/**
	 * Called by RemexHTML when parsing of a source document starts.
	 * @inheritDoc
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		// This will emit a doctype even if fragment name is set. It needs to be
		// removed manually after getting the result from the Formatter.
		$this->serializer->startDocument( $fragmentNamespace, $fragmentName );
	}

	/**
	 * Called by RemexHTML when parsing stops.
	 * @param int $pos The input string length, i.e. the past-the-end position.
	 */
	public function endDocument( $pos ) {
		$this->endPos = $pos;
		// Only forward the event; no cleanup is done here since this is not necessarily
		// the end of the output document.
		$this->serializer->endDocument( $this->getPosition( $pos ) );
	}

	/**
	 * Called by RemexHTML when parsing characters.
	 * @inheritDoc
	 */
	public function characters(
		$preposition, $ref, $text, $start, $length, $sourceStart, $sourceLength
	) {
		$this->serializer->characters( $preposition, $ref, $text, $start, $length,
			$this->getPosition( $sourceStart ), $sourceLength );
	}

	/**
	 * Called by RemexHTML when parsing an element.
	 * @inheritDoc
	 */
	public function insertElement(
		$preposition, $ref, Element $element, $void, $sourceStart, $sourceLength
	) {
		// if the serializer has already seen this element, we already munged it
		if ( !$element->userData ) {
			// Order matters: the heading level is fixed first so that numbering sees the
			// final level, and ids are renamed last.
			$this->fixHeading( $element );
			$this->numberHeading( $element );
			$this->fixId( $element->attrs, $element );
		}
		$this->serializer->insertElement( $preposition, $ref, $element, $void,
			$this->getPosition( $sourceStart ), $sourceLength );
	}

	/**
	 * Called by RemexHTML when parsing an end tag.
	 * @inheritDoc
	 */
	public function endTag( Element $element, $sourceStart, $sourceLength ) {
		$this->serializer->endTag( $element, $this->getPosition( $sourceStart ), $sourceLength );
	}

	/**
	 * Called by RemexHTML when parsing a doctype declaration.
	 * @inheritDoc
	 */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
		// we only need the body so no point in forwarding this
	}

	/**
	 * Called by RemexHTML when parsing a comment.
	 * @inheritDoc
	 */
	public function comment( $preposition, $ref, $text, $sourceStart, $sourceLength ) {
		$this->serializer->comment( $preposition, $ref, $text,
			$this->getPosition( $sourceStart ), $sourceLength );
	}

	/**
	 * Called by RemexHTML on parse errors.
	 * @inheritDoc
	 */
	public function error( $text, $pos ) {
		$this->serializer->error( $text, $this->getPosition( $pos ) );
	}

	/**
	 * Called by RemexHTML when updating element attributes.
	 * @inheritDoc
	 */
	public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
		// RemexHTML should only call this method for <html> and <body> which we discard
		// so there is probably no need to fix ids but do it anyway just in case
		$this->fixId( $attrs, $element );
		$this->serializer->mergeAttributes( $element, $attrs, $this->getPosition( $sourceStart ) );
	}

	/**
	 * Called by RemexHTML in some edge cases when fixing invalid HTML.
	 * @inheritDoc
	 */
	public function removeNode( Element $element, $sourceStart ) {
		$this->serializer->removeNode( $element, $this->getPosition( $sourceStart ) );
	}

	/**
	 * Called by RemexHTML in some edge cases when fixing invalid HTML.
	 * @inheritDoc
	 */
	public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
		$this->serializer->reparentChildren( $element, $newParent, $this->getPosition( $sourceStart ) );
	}

	/**
	 * Translate a position in one of the source documents to a position in the document collection.
	 * This is only used for debugging so we just generate a number which makes it obvious where
	 * to look in the source documents.
	 * @param int $originalSourceStart
	 * @return int
	 */
	private function getPosition( $originalSourceStart ) {
		// "concatenate" document index and position within document.
		// this leaves ~100MB index space for each document which is plenty.
		// NOTE: with 32-bit integers the result only fits for roughly the first 21 documents;
		// since the value is only a debugging aid this is accepted.
		return (int)( $this->documentIndex * 1e8 + $originalSourceStart );
	}

	/**
	 * Fix $element if it is a heading with the wrong level.
	 * h1 and maybe h2 are reserved for chapter/article titles, if we encounter any,
	 * force the whole heading structure to be on a lower level.
	 * The displacement only ever grows within a source document and the resulting level
	 * is capped at h6.
	 * @param Element $element
	 */
	private function fixHeading( $element ) {
		if ( !$this->isHeading( $element ) ) {
			return;
		}

		$level = (int)substr( $element->htmlName, 1 );
		$displace = max( $this->headingDisplacementLevel, $this->options['topHeadingLevel'] - $level );
		$this->headingDisplacementLevel = $displace;
		$newLevel = min( $level + $displace, 6 );
		if ( $newLevel === $level ) {
			return;
		}

		// update section data; write through $this->sectionRef (a reference to the caller's
		// array) by index instead of iterating by reference, so no dangling reference is left.
		if ( isset( $element->attrs['id'] ) ) {
			$headingId = $element->attrs['id'];
			foreach ( $this->sectionRef as $index => $section ) {
				if ( $section['id'] === $headingId ) {
					$this->sectionRef[$index]['level'] = $newLevel;
				}
			}
		}
		$element->name = $element->htmlName = 'h' . $newLevel;
	}

	/**
	 * Add numbers before section/chapter/article titles if configured to do so.
	 * Section numbers are hierarchic, e.g. subsection 4 of section 3 of article 2
	 * (of a book with no chapters) will be numbered "2.3.4".
	 * @param Element $element
	 */
	private function numberHeading( $element ) {
		if ( !$this->isHeading( $element ) || !$this->options['numberSections'] ) {
			return;
		}
		$level = (int)substr( $element->htmlName, 1 );
		// Add the section number as a data element that can be displayed via CSS.
		// This is more semantic and probably more safe as well than trying to change
		// the content of a tag while Remex is parsing it.
		// Ideally such numbers would be added via CSS counters but that's problematic
		// because sections are not hierarchic in the DOM tree and they can have gaps
		// - e.g. we can have "<h2/><h4/>" in which case we want the section numbers
		// to be "1 1.1" and not "1 1.0.1".
		$element->attrs['data-mw-sectionnumber'] = $this->sectionCounter->incrementAndGet( $level );
	}

	/**
	 * Fix $element if it has or refers to an id which conflicts with an id in another document.
	 * Needed to prevent id conflicts (e.g. two documents using the same section name). Also fix
	 * Parsoid internal references to be #section, not ./Title#section.
	 * @param Attributes $attrs
	 * @param Element $element
	 */
	private function fixId( $attrs, $element ) {
		if ( isset( $attrs['id'] ) ) {
			$oldId = $attrs['id'];
			$newId = $this->getUnreservedId( $oldId );
			if ( $newId !== $oldId ) {
				// if we renamed a heading anchor, update section data
				if ( $this->isHeading( $element ) ) {
					foreach ( $this->sectionRef as $index => $section ) {
						if ( $section['id'] === $oldId ) {
							$this->sectionRef[$index]['id'] = $newId;
							break;
						}
					}
				}
				$attrs['id'] = $newId;
			}
		}
		// Make sure local references are in sync with ids.
		// We don't try to update cross-document references, too much effort.
		if (
			$element->htmlName === 'a' && isset( $attrs['href'] )
			&& $this->startsWith( $attrs['href'], $this->selfLink . '#' )
		) {
			// Everything after "<selfLink>#" is the target id. It must stay a string:
			// casting it to int would turn e.g. "History" into 0 and break the link.
			$targetId = substr( $attrs['href'], strlen( $this->selfLink ) + 1 );
			$attrs['href'] = '#' . $this->getUnreservedId( $targetId );
		}
	}

	/**
	 * Get an unreserved id and update the mapping.
	 * Will return $id if it does not conflict with earlier documents; otherwise it will find
	 * a free name and use that instead, consistently.
	 * @param string $id
	 * @return string
	 */
	private function getUnreservedId( $id ) {
		if ( !isset( $this->idMap[$id] ) ) {
			// No conflict. Mark this id as being in use.
			$this->idMap[$id] = true;
			return (string)$id;
		}

		$state = $this->idMap[$id];
		if ( $state === true ) {
			// This id has been used in the same source document. That's fine, nothing to do.
			return (string)$id;
		}
		if ( $state !== false ) {
			// This id has already been remapped for the current source document.
			return $state;
		}

		// This id has been used in a different source document, must remap:
		// try "<id>_2", "<id>_3", ... until a name not yet in the map is found.
		$n = 2;
		do {
			$replacement = $id . '_' . $n++;
		} while ( isset( $this->idMap[$replacement] ) );
		$this->idMap[$id] = $replacement;
		// Reserve the replacement itself so a literal occurrence of it gets remapped too.
		$this->idMap[$replacement] = false;
		return $replacement;
	}

	/**
	 * Is $element a HTML heading (h1..h6) tag?
	 * @param Element $element
	 * @return bool
	 */
	private function isHeading( $element ) {
		return in_array( $element->htmlName, [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ], true );
	}

	/**
	 * Check for prefix match.
	 * @param string $haystack
	 * @param string $needle
	 * @return bool
	 */
	private function startsWith( $haystack, $needle ) {
		// strncmp() compares at most strlen( $needle ) bytes and, unlike substr_compare(),
		// has no offset argument that could be rejected for an empty haystack.
		return strncmp( $haystack, $needle, strlen( $needle ) ) === 0;
	}

}