Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.81% |
91 / 94 |
|
75.00% |
9 / 12 |
CRAP | |
0.00% |
0 / 1 |
MoveLeadParagraphTransform | |
96.81% |
91 / 94 |
|
75.00% |
9 / 12 |
47 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
apply | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
matchElement | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
findParentWithParent | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
identifyInfoboxElement | |
95.00% |
19 / 20 |
|
0.00% |
0 / 1 |
7 | |||
identifyLeadParagraph | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
moveFirstParagraphBeforeInfobox | |
96.30% |
26 / 27 |
|
0.00% |
0 / 1 |
13 | |||
isNotEmptyNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isNonLeadParagraph | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
6 | |||
isPreviousSibling | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
hasNoNonEmptyPrecedingParagraphs | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
logInfoboxesWrappedInContainers | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace MobileFrontend\Transforms; |
4 | |
5 | use DOMDocument; |
6 | use DOMElement; |
7 | use DOMNode; |
8 | use DOMXPath; |
9 | use MediaWiki\MediaWikiServices; |
10 | use MediaWiki\Title\Title; |
11 | use Wikimedia\Parsoid\Utils\DOMCompat; |
12 | |
13 | class MoveLeadParagraphTransform implements IMobileTransform { |
/**
 * Page the transform is applied to; only interpolated into the log message
 * written by logInfoboxesWrappedInContainers().
 *
 * @var Title|string
 */
private $title;

/**
 * Revision identifier; only interpolated into the log message written by
 * logInfoboxesWrappedInContainers().
 *
 * @var int
 */
private $revId;

/**
 * Stores the page title and revision id so they can be reported when a
 * wrapped infobox is logged.
 *
 * @param Title|string $title for logging purposes
 * @param int $revId for logging purposes
 */
public function __construct( $title, $revId ) {
	$this->revId = $revId;
	$this->title = $title;
}
32 | |
/**
 * Rearranges content so that text in the lead paragraph is prioritised to appear
 * before the infobox.
 *
 * Only the first `section` element matched by the selector inside $node is
 * processed; when no section is found nothing happens.
 *
 * @param DOMElement $node to be transformed
 */
public function apply( DOMElement $node ) {
	$leadSection = DOMCompat::querySelector( $node, 'section' );
	if ( !$leadSection ) {
		return;
	}

	$this->moveFirstParagraphBeforeInfobox( $leadSection, $leadSection->ownerDocument );
}
45 | |
/**
 * Helper function to verify that the passed $node has the required tag name and
 * carries at least one class matching the required pattern.
 *
 * The class attribute is split on any run of whitespace (space, tab, newline,
 * ...), so tokens separated by something other than a single space are still
 * recognised. Empty tokens are discarded, which means an element without any
 * class never matches.
 *
 * @param DOMElement $node Node to verify
 * @param string|bool $requiredTagName Required tag name, has to be lowercase
 *  if false it is ignored and requiredClass is used.
 * @param string $requiredClass Regular expression with required class name;
 *  it is applied to every class token separately
 * @return bool
 */
private static function matchElement( DOMElement $node, $requiredTagName, $requiredClass ) {
	if ( $requiredTagName !== false && strtolower( $node->tagName ) !== $requiredTagName ) {
		return false;
	}

	$classes = preg_split( '/\s+/', $node->getAttribute( 'class' ), -1, PREG_SPLIT_NO_EMPTY );
	if ( !$classes ) {
		// No class tokens at all (or the split failed): nothing can match.
		return false;
	}

	// preg_grep() returns the matching tokens; a non-empty result means a match.
	return (bool)preg_grep( $requiredClass, $classes );
}
59 | |
/**
 * Walk up the DOM tree from $node until reaching the node whose parent is $parent.
 *
 * @param DOMNode $node Starting point of the walk
 * @param DOMNode $parent Node whose direct child is being looked for
 * @return DOMNode Either $node itself or the ancestor of $node that is a direct
 *  child of $parent. $node is expected to be a descendant of $parent; when it is
 *  not, the walk only stops at the topmost node that has no parent and that node
 *  is returned instead.
 */
private static function findParentWithParent( $node, $parent ) {
	$current = $node;
	for ( ; ; ) {
		$above = $current->parentNode;
		if ( !$above || $above->isSameNode( $parent ) ) {
			return $current;
		}
		$current = $above;
	}
}
77 | |
/**
 * Extract the first infobox-like element found inside $section.
 *
 * Elements with the class `infobox`, elements with the class `thumb` and
 * `figure` elements are all treated as infobox candidates; the first node
 * returned by the combined XPath query is used.
 *
 * If the candidate sits inside a `mw-stack` or `collapsible` wrapper, the
 * outermost such wrapper is returned instead. Only wrappers located inside
 * $section are considered, so the result is always a descendant of $section.
 * For thumbnails and figures the result is widened further to the ancestor
 * that is a direct child of $section.
 *
 * @param DOMXPath $xPath XPath object to execute the query
 * @param DOMElement $section Where to search for an infobox
 * @return DOMElement|null The first infobox, or null when there is none
 */
private function identifyInfoboxElement( DOMXPath $xPath, DOMElement $section ): ?DOMElement {
	$paths = [
		// Infoboxes: *.infobox
		'.//*[contains(concat(" ",normalize-space(@class)," ")," infobox ")]',
		// Thumbnail images: .thumb, figure (Parsoid)
		'.//*[contains(concat(" ",normalize-space(@class)," ")," thumb ")]',
		'.//figure',
	];
	$query = '(' . implode( '|', $paths ) . ')';
	$infobox = $xPath->query( $query, $section )->item( 0 );

	if ( !( $infobox instanceof DOMElement ) ) {
		return null;
	}

	// Check if the infobox is inside a container. The walk stops at $section:
	// the section itself and anything above it must never be picked as the
	// infobox, otherwise a node outside the search scope would be returned.
	$wrapperClass = '/^(mw-stack|collapsible)$/';
	$node = $infobox;
	while ( $node instanceof DOMElement && !$node->isSameNode( $section ) ) {
		if ( self::matchElement( $node, false, $wrapperClass ) ) {
			$infobox = $node;
		}
		$node = $node->parentNode;
	}

	// For images, include any containers.
	// We don't need to check if the parent is an infobox, because it
	// would've matched first in the XPath query.
	if (
		strtolower( $infobox->tagName ) === 'figure' ||
		strpos( $infobox->getAttribute( 'class' ), 'thumb' ) !== false
	) {
		// Climb to the direct child of $section. The instanceof guard keeps the
		// loop from dereferencing a missing or non-element parent.
		while (
			$infobox->parentNode instanceof DOMElement &&
			!$infobox->parentNode->isSameNode( $section )
		) {
			$infobox = $infobox->parentNode;
		}
	}

	return $infobox;
}
121 | |
/**
 * Find the first direct child paragraph of $section that has text content,
 * i.e. the first paragraph that is not empty.
 * Paragraphs whose nodes contain only whitespace are skipped as well.
 * example: `<p> <span> </span> </p>` is not a lead paragraph
 *
 * Keep in sync with mobile.init/identifyLeadParagraph.js.
 *
 * @param DOMXPath $xPath XPath object to execute the query
 * @param DOMElement $section Where to search for paragraphs
 * @return DOMElement|null The lead paragraph, or null when no paragraph qualifies
 */
private function identifyLeadParagraph( DOMXPath $xPath, DOMElement $section ): ?DOMElement {
	foreach ( $xPath->query( './p', $section ) as $candidate ) {
		if ( $candidate && !$this->isNonLeadParagraph( $xPath, $candidate ) ) {
			/** @phan-suppress-next-line PhanTypeMismatchReturn DOMNode vs. DOMElement */
			return $candidate;
		}
	}

	return null;
}
149 | |
/**
 * Move the first paragraph in the lead section above the infobox
 *
 * The paragraph is moved only when all of the following hold:
 * - the lead section contains at least one infobox;
 * - that infobox is a direct child of the lead section;
 * - a paragraph with visible text content exists in the lead section;
 * - the infobox appears before that paragraph among its siblings.
 *
 * If the element immediately following the paragraph is a list (ol or ul
 * element), the list is moved above the infobox together with the paragraph.
 *
 * The paragraph is inserted directly before the infobox, so it is not moved
 * before hatnotes, mboxes or other elements that are not infoboxes.
 *
 * When the infobox is nested inside another element nothing is moved. Instead,
 * if no lead-paragraph candidate precedes the infobox's top-level container and
 * the `MFLogWrappedInfoboxes` setting is enabled, the page is logged.
 *
 * Note: this method performs no namespace check of its own.
 *
 * @param DOMElement $leadSection
 * @param ?DOMDocument $doc Document to which the section belongs
 */
private function moveFirstParagraphBeforeInfobox( DOMElement $leadSection, ?DOMDocument $doc ) {
	if ( $doc === null ) {
		return;
	}

	$xPath = new DOMXPath( $doc );
	$infobox = $this->identifyInfoboxElement( $xPath, $leadSection );
	if ( !$infobox ) {
		return;
	}

	$leadParagraph = $this->identifyLeadParagraph( $xPath, $leadSection );
	$isTopLevelInfobox = $infobox->parentNode->isSameNode( $leadSection );

	if ( $leadParagraph && $isTopLevelInfobox &&
		$this->isPreviousSibling( $infobox, $leadParagraph )
	) {
		// The following element has to be looked up before the paragraph is
		// moved, because moving it changes its siblings.
		$trailingList = null;
		$nextElement = $xPath->query( 'following-sibling::*[1]', $leadParagraph )->item( 0 );
		if ( $nextElement !== null ) {
			/** @phan-suppress-next-line PhanUndeclaredProperty DOMNode vs. DOMElement */
			$nextTag = $nextElement->tagName;
			if ( $nextTag === 'ol' || $nextTag === 'ul' ) {
				$trailingList = $nextElement;
			}
		}

		$leadSection->insertBefore( $leadParagraph, $infobox );
		if ( $trailingList !== null ) {
			$leadSection->insertBefore( $trailingList, $infobox );
		}
		return;
	}

	if ( $isTopLevelInfobox ) {
		return;
	}

	/** @phan-suppress-next-line PhanTypeMismatchArgumentSuperType DOMNode vs. DOMElement */
	$isInWrongPlace = $this->hasNoNonEmptyPrecedingParagraphs( $xPath,
		self::findParentWithParent( $infobox, $leadSection )
	);
	$config = MediaWikiServices::getInstance()->getService( 'MobileFrontend.Config' );
	$loggingEnabled = $config->get( 'MFLogWrappedInfoboxes' );
	/**
	 * @see https://phabricator.wikimedia.org/T149884
	 * @todo remove after research is done
	 */
	if ( $isInWrongPlace && $loggingEnabled ) {
		$this->logInfoboxesWrappedInContainers();
	}
}
216 | |
/**
 * Tell whether the text content of the node has at least one character
 * that is not whitespace.
 *
 * Keep in sync with mobile.init/identifyLeadParagraph.js.
 *
 * @param DOMNode $node
 * @return bool True when a non-whitespace character is present
 */
private function isNotEmptyNode( DOMNode $node ) {
	$text = $node->textContent;

	return preg_match( '/\S/', $text ) === 1;
}
228 | |
/**
 * Checks whether a node has to be ruled out as the lead paragraph of the article.
 *
 * A node is ruled out when it is not a `p` element, when it has no
 * non-whitespace text, or when all of its non-whitespace text lives inside
 * `style` elements or a `span` with the id `coordinates`.
 *
 * Keep in sync with mobile.init/identifyLeadParagraph.js.
 *
 * @param DOMXPath $xPath An XPath query
 * @param DOMNode $node DOM Node to verify
 * @return bool True when the node cannot be the lead paragraph, false when it can
 */
private function isNonLeadParagraph( $xPath, $node ) {
	if ( $node->nodeType !== XML_ELEMENT_NODE ) {
		return true;
	}
	/** @phan-suppress-next-line PhanUndeclaredProperty DOMNode vs. DOMElement */
	if ( $node->tagName !== 'p' ) {
		return true;
	}
	if ( !$this->isNotEmptyNode( $node ) ) {
		return true;
	}

	// Work on a deep copy so the document itself is left untouched
	$copy = $node->cloneNode( true );

	// Strip TemplateStyles tags and coordinate wrappers from the copy...
	$unwantedNodes = $xPath->query( '(.//style|.//span[@id="coordinates"])', $copy );
	foreach ( $unwantedNodes as $unwanted ) {
		$unwanted->parentNode->removeChild( $unwanted );
	}

	// ...and the paragraph only counts if text is left afterwards
	return !$this->isNotEmptyNode( $copy );
}
263 | |
/**
 * Check if $first is located somewhere before $second among its siblings
 *
 * The search walks backwards from $second. Both nodes are expected to sit near
 * the beginning of the article, so the walk is short: when the nodes are in
 * order $first is reached after few steps, and when they are not the walk runs
 * out of previous siblings quickly instead of traversing the whole document.
 *
 * @param DOMNode $first
 * @param DOMNode $second
 * @return bool True when $first is one of the previous siblings of $second
 */
private function isPreviousSibling( DOMNode $first, DOMNode $second ) {
	for (
		$cursor = $second->previousSibling;
		$cursor !== null;
		$cursor = $cursor->previousSibling
	) {
		if ( $cursor->isSameNode( $first ) ) {
			return true;
		}
	}

	return false;
}
287 | |
/**
 * Check that none of the siblings preceding $element qualifies as a lead
 * paragraph, i.e. that no non-empty paragraph comes before $element.
 *
 * @param DOMXPath $xPath
 * @param DOMElement $element
 * @return bool True when no preceding sibling is a non-empty paragraph,
 *  false as soon as one is found
 */
private function hasNoNonEmptyPrecedingParagraphs( DOMXPath $xPath, DOMElement $element ) {
	for (
		$sibling = $element->previousSibling;
		$sibling !== null;
		$sibling = $sibling->previousSibling
	) {
		if ( !$this->isNonLeadParagraph( $xPath, $sibling ) ) {
			return false;
		}
	}

	return true;
}
305 | |
/**
 * Write an info-level entry to the `mobile` log channel stating that an infobox
 * wrapped in a container was found, together with the page title and revision
 * id given to the constructor.
 *
 * The detection itself is done by the caller; this method only logs.
 *
 * @see https://phabricator.wikimedia.org/T149884
 */
private function logInfoboxesWrappedInContainers() {
	$logger = \MediaWiki\Logger\LoggerFactory::getInstance( 'mobile' );
	$logger->info(
		"Found infobox wrapped with container on {$this->title} (rev:{$this->revId})"
	);
}
317 | } |