Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
96.59% |
85 / 88 |
|
72.73% |
8 / 11 |
CRAP | |
0.00% |
0 / 1 |
| MoveLeadParagraphTransform | |
96.59% |
85 / 88 |
|
72.73% |
8 / 11 |
44 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| apply | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| findParentWithParent | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
| identifyInfoboxElement | |
94.44% |
17 / 18 |
|
0.00% |
0 / 1 |
8.01 | |||
| identifyLeadParagraph | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| moveFirstParagraphBeforeInfobox | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
13 | |||
| isNotEmptyNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| isNonLeadParagraph | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
6 | |||
| isPreviousSibling | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| hasNoNonEmptyPrecedingParagraphs | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
| logInfoboxesWrappedInContainers | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MobileFrontend\Transforms; |
| 4 | |
| 5 | use MediaWiki\MediaWikiServices; |
| 6 | use MediaWiki\Title\Title; |
| 7 | use Wikimedia\Parsoid\DOM\Document; |
| 8 | use Wikimedia\Parsoid\DOM\Element; |
| 9 | use Wikimedia\Parsoid\DOM\Node; |
| 10 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 11 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 12 | |
class MoveLeadParagraphTransform implements IMobileTransform {
	/**
	 * Page the transform is applied to; only used when building the log message.
	 *
	 * @var Title|string
	 */
	private $title;

	/**
	 * @param Title|string $title for logging purposes
	 * @param int $revId for logging purposes
	 */
	public function __construct(
		$title,
		private readonly int $revId,
	) {
		$this->title = $title;
	}

	/**
	 * Rearranges content so that text in the lead paragraph is prioritised to appear
	 * before the infobox.
	 *
	 * Only the first `section` element found below $node is processed; when there is
	 * none, the DOM is left untouched.
	 *
	 * @param Element $node to be transformed
	 */
	public function apply( Element $node ) {
		$section = DOMCompat::querySelector( $node, 'section' );
		if ( $section ) {
			$this->moveFirstParagraphBeforeInfobox( $section, $section->ownerDocument );
		}
	}

	/**
	 * Iterate up the DOM tree until a node is found whose parent is $parent.
	 *
	 * It is assumed that $node is a descendant of $parent. When it is not, the walk
	 * simply stops at the top-most ancestor of $node and that ancestor is returned.
	 *
	 * @param Node $node
	 * @param Node $parent
	 * @return Node Either $node itself or the ancestor of $node that is a direct
	 *  child of $parent.
	 */
	private static function findParentWithParent( $node, $parent ): Node {
		$search = $node;
		while ( $search->parentNode && !$search->parentNode->isSameNode( $parent ) ) {
			$search = $search->parentNode;
		}
		// @var Node $search We assert this will always find a parent.
		'@phan-var Node $search';
		return $search;
	}

	/**
	 * Extract the first infobox (or thumbnail image) found inside $section.
	 *
	 * When the match is wrapped in `.mw-stack` or `.collapsible` containers, the
	 * outermost such container inside $section is returned instead. Thumbnails are
	 * additionally expanded to their ancestor that is a direct child of $section.
	 *
	 * @param Element $section Where to search for an infobox
	 * @return Element|null The first infobox, or null when $section has none
	 */
	private function identifyInfoboxElement( Element $section ): ?Element {
		$infobox = DOMCompat::querySelector(
			$section,
			// Infoboxes are .infobox, and thumbnail images are .thumb, figure
			'.infobox, .thumb, figure'
		);

		if ( !( $infobox instanceof Element ) ) {
			return null;
		}

		// Check if the infobox is inside a container. The walk stops at $section:
		// anything at or above it is outside the search scope and must never be
		// returned as "the infobox".
		$node = $infobox;
		while ( $node instanceof Element && !$node->isSameNode( $section ) ) {
			$classList = DOMCompat::getClassList( $node );
			if ( $classList->contains( 'mw-stack' ) || $classList->contains( 'collapsible' ) ) {
				// Keep overwriting so that the outermost container wins.
				$infobox = $node;
			}
			$node = $node->parentNode;
		}

		// For images, include any containers.
		// We don't need to check if the parent is an infobox, because it
		// would've matched first in the selector query.
		$isThumbnail = DOMUtils::nodeName( $infobox ) === 'figure' ||
			DOMCompat::getClassList( $infobox )->contains( 'thumb' );
		if ( $isThumbnail ) {
			// The instanceof guard keeps the loop from running off the top of the tree.
			while (
				$infobox->parentNode instanceof Element &&
				!$infobox->parentNode->isSameNode( $section )
			) {
				$infobox = $infobox->parentNode;
			}
		}

		return $infobox;
	}

	/**
	 * Find the first paragraph that has text content, i.e. a paragraph that is not empty.
	 * Paragraphs whose nodes contain only whitespace are skipped as well.
	 * example: `<p> <span> </span> </p>` is not a lead paragraph
	 *
	 * Only direct `p` children of $section are considered.
	 *
	 * Keep in sync with mobile.editor.overlay/identifyLeadParagraph.js.
	 *
	 * @param Element $section Where to search for paragraphs
	 * @return Element|null The lead paragraph
	 */
	private function identifyLeadParagraph( Element $section ): ?Element {
		foreach ( DOMCompat::querySelectorAll( $section, ':scope > p' ) as $paragraph ) {
			if ( !$this->isNonLeadParagraph( $paragraph ) ) {
				return $paragraph;
			}
		}
		return null;
	}

	/**
	 * Move the first paragraph in the lead section above the infobox
	 *
	 * In order for a paragraph to be moved the following conditions must be met:
	 *   - the lead section contains at least one infobox;
	 *   - that infobox is a direct child of the lead section;
	 *   - the paragraph doesn't already appear before the first infobox
	 *     if any in the DOM;
	 *   - the paragraph contains visible text content
	 *
	 * NOTE(review): earlier documentation also listed "article belongs to the MAIN
	 * namespace" as a condition. No namespace check happens in this method, so
	 * presumably the caller enforces it - confirm.
	 *
	 * Additionally if paragraph immediate sibling is a list (ol or ul element), the list
	 * is also moved along with paragraph above infobox.
	 *
	 * Note that the first paragraph is not moved before hatnotes, or mbox or other
	 * elements that are not infoboxes.
	 *
	 * When the infobox is nested inside some other element nothing is moved; instead
	 * the page is logged if no lead paragraph precedes the wrapper and the
	 * `MFLogWrappedInfoboxes` setting is enabled.
	 *
	 * @param Element $leadSection
	 * @param ?Document $doc Document to which the section belongs
	 */
	private function moveFirstParagraphBeforeInfobox( Element $leadSection, ?Document $doc ): void {
		if ( $doc === null ) {
			return;
		}

		$infobox = $this->identifyInfoboxElement( $leadSection );
		if ( $infobox === null ) {
			return;
		}

		$infoboxParent = $infobox->parentNode;
		$isTopLevelInfobox = $infoboxParent !== null && $infoboxParent->isSameNode( $leadSection );

		if ( !$isTopLevelInfobox ) {
			$isInWrongPlace = $this->hasNoNonEmptyPrecedingParagraphs(
				/** @phan-suppress-next-line PhanTypeMismatchArgumentSuperType Node vs. Element */
				self::findParentWithParent( $infobox, $leadSection )
			);
			/**
			 * @see https://phabricator.wikimedia.org/T149884
			 * @todo remove after research is done
			 */
			// The config service is only looked up when there is something to log.
			if (
				$isInWrongPlace &&
				MediaWikiServices::getInstance()
					->getService( 'MobileFrontend.Config' )->get( 'MFLogWrappedInfoboxes' )
			) {
				$this->logInfoboxesWrappedInContainers();
			}
			return;
		}

		$leadParagraph = $this->identifyLeadParagraph( $leadSection );
		if ( $leadParagraph === null || !$this->isPreviousSibling( $infobox, $leadParagraph ) ) {
			// Either there is no lead paragraph or it already precedes the infobox.
			return;
		}

		// Look up the paragraph's neighbour before moving anything: once the paragraph
		// has been moved its next sibling is a different element.
		$listElementAfterParagraph = null;
		$elementAfterParagraph = DOMCompat::getNextElementSibling( $leadParagraph );
		if ( $elementAfterParagraph ) {
			$nodeName = DOMUtils::nodeName( $elementAfterParagraph );
			if ( $nodeName === 'ol' || $nodeName === 'ul' ) {
				$listElementAfterParagraph = $elementAfterParagraph;
			}
		}

		$leadSection->insertBefore( $leadParagraph, $infobox );
		if ( $listElementAfterParagraph !== null ) {
			// Inserted after the paragraph, so the original paragraph/list order is kept.
			$leadSection->insertBefore( $listElementAfterParagraph, $infobox );
		}
	}

	/**
	 * Check if the node contains any non-whitespace characters
	 *
	 * The pattern is evaluated in UTF-8 mode and lists Unicode separators and U+FEFF
	 * explicitly, so that e.g. a paragraph holding only `&nbsp;` is treated as empty,
	 * just like JavaScript's `\s` does in the client-side counterpart. A byte-wise
	 * `\S` would match the bytes of such characters and report the node as non-empty.
	 *
	 * Keep in sync with mobile.init/identifyLeadParagraph.js.
	 *
	 * @param Node $node
	 * @return bool
	 */
	private function isNotEmptyNode( Node $node ): bool {
		return preg_match( '/[^\s\p{Z}\x{FEFF}]/u', $node->textContent ?? '' ) === 1;
	}

	/**
	 * Checks whether the node has to be ruled out as the lead paragraph of the
	 * article: it is not a `p` element, or it has no visible content.
	 *
	 * Keep in sync with mobile.init/identifyLeadParagraph.js.
	 *
	 * @param Node $node DOM Node to verify
	 * @return bool True when the node can NOT be the lead paragraph
	 */
	private function isNonLeadParagraph( $node ): bool {
		$isParagraphWithText = $node->nodeType === XML_ELEMENT_NODE &&
			DOMUtils::nodeName( $node ) === 'p' &&
			$this->isNotEmptyNode( $node );
		if ( !$isParagraphWithText ) {
			return true;
		}

		// Clone the node so we can modify it without touching the document
		$node = $node->cloneNode( true );
		// @var Element $node
		'@phan-var Element $node';

		// Remove any TemplateStyle tags, or coordinate wrappers...
		$templateStyles = DOMCompat::querySelectorAll(
			$node, 'style, span#coordinates'
		);
		foreach ( $templateStyles as $style ) {
			$style->parentNode->removeChild( $style );
		}

		// ...and check again for emptiness
		return !$this->isNotEmptyNode( $node );
	}

	/**
	 * Check if the $first is previous sibling of $second
	 *
	 * Both nodes ($first and $second) most probably will be located in the beginning of
	 * article, because of that it's better to loop backward from $second to $first.
	 * Usually those two elements should be in order, it means that we will do only one
	 * `isSameNode()` check. If those elements are not in the order, we will quickly get to
	 * $node->previousSibling==null and return false instead of the whole traversing document.
	 *
	 * @param Node $first
	 * @param Node $second
	 * @return bool
	 */
	private function isPreviousSibling( Node $first, Node $second ): bool {
		for ( $node = $second->previousSibling; $node !== null; $node = $node->previousSibling ) {
			if ( $node->isSameNode( $first ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Check that none of the siblings before $element qualifies as a lead paragraph,
	 * i.e. that there is no non-empty `p` element among them.
	 *
	 * @param Element $element
	 * @return bool True when no non-empty paragraph precedes $element
	 */
	private function hasNoNonEmptyPrecedingParagraphs( Element $element ): bool {
		for ( $node = $element->previousSibling; $node !== null; $node = $node->previousSibling ) {
			if ( !$this->isNonLeadParagraph( $node ) ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Log the page title and revision of a page whose infobox is wrapped in a
	 * container, to the `mobile` log channel at info level.
	 *
	 * @see https://phabricator.wikimedia.org/T149884
	 */
	private function logInfoboxesWrappedInContainers(): void {
		\MediaWiki\Logger\LoggerFactory::getInstance( 'mobile' )->info(
			"Found infobox wrapped with container on {$this->title} (rev:{$this->revId})"
		);
	}
}