All files / mobile.startup PageHTMLParser.js

95.38% Statements 62/65
95.55% Branches 43/45
92.3% Functions 12/13
95.31% Lines 61/64
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283  
1x
1x
1x
2x
1x
2x
 
 
 
 
 
 
 
 
 
 
 
48x
 
 
 
48x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19x
 
 
 
 
 
19x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21x
 
 
8x
 
 
21x
 
4x
4x
 
 
2x
2x
 
 
 
2x
 
2x
2x
 
 
 
 
 
 
 
 
 
 
17x
 
 
 
 
17x
 
5x
 
5x
5x
 
 
 
 
 
 
 
 
12x
12x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4x
 
4x
2x
 
 
2x
 
 
 
 
 
 
 
 
 
 
11x
11x
11x
11x
11x
11x
 
 
11x
 
 
 
11x
 
 
6x
 
 
 
11x
6x
 
 
 
 
 
 
 
5x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9x
 
9x
 
9x
 
9x
7x
7x
 
7x
3x
 
 
9x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5x
 
39x
 
 
39x
39x
 
39x
 
 
 
 
 
 
 
5x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
 
 
 
 
1x
 
1x
  const
	Thumbnail = require( './Thumbnail' ),
	HEADING_SELECTOR = mw.config.get( 'wgMFMobileFormatterHeadings', [ 'h1', 'h2', 'h3', 'h4', 'h5' ] ).join( ',' ),
	EXCLUDE_THUMBNAIL_CLASS_SELECTORS = [ 'noviewer', 'metadata' ],
	NOT_SELECTORS = EXCLUDE_THUMBNAIL_CLASS_SELECTORS.map( ( excludeSelector ) => `:not(.${ excludeSelector })` ).join( '' ),
	THUMB_SELECTOR = [ 'a.image', 'a.thumbimage, a.mw-file-description' ].map(
		( selector ) => `${ selector }${ NOT_SELECTORS }`
	).join( ',' );
 
/**
 * Parses an article and converts it into a queriable object.
 */
class PageHTMLParser {
	/**
	 * @param {jQuery.Object} $container Used when parsing to find children within
	 * this container
	 */
	constructor( $container ) {
		this.$el = $container;

		// T220751: Cache headings as $el.find is a very expensive call.
		/** @private */
		this.$headings = this.$el.find( HEADING_SELECTOR );
	}

	/**
	 * Find the heading in the page.
	 * This has the benefit of excluding any additional h2s and h3s that may
	 * have been added programmatically.
	 *
	 * @param {number} sectionIndex as defined by the PHP parser.
	 *  It should correspond to the section id
	 *  used in the edit link for the section.
	 *  Note, confusingly, this is different from section "ID" which is
	 *  used in methods
	 * @return {jQuery.Object} the matching heading, or an empty set when
	 *  sectionIndex is smaller than 1 or no such heading exists.
	 */
	findSectionHeadingByIndex( sectionIndex ) {
		if ( sectionIndex < 1 ) {
			// negative indexes will search from the end, which is behaviour we do not want.
			// return an empty set when this happens.
			// eslint-disable-next-line no-undef
			return $( [] );
		} else {
			return this.$headings
				// Headings must strictly be a child element of a section element
				// or the parser-output.
				// Not an ancestor!
				.filter( '.mw-parser-output > *, [class^="mf-section-"] > *' ).eq( sectionIndex - 1 );
		}
	}

	/**
	 * Finds all child elements that match the selector in a given section or subsection.
	 * Returns any direct child elements that match the selector,
	 * (i.e. searches only one level deep)
	 * as well as any elements that match the selector within those children.
	 * If the Page has no headings (e.g. a stub),
	 * then the search will target all nodes within the page.
	 *
	 * This code should work on desktop (PHP parser HTML)
	 * as well as mobile formatted HTML (PHP parser + MobileFormatter)
	 *
	 * @param {number} sectionIndex as defined by the PHP parser. It should correspond to
	 *  the section id used in the edit link for the section.
	 *  Note, confusingly, this is different from section "ID" which is
	 *  used in methods
	 * @param {string} selector to match
	 * @return {jQuery.Object}
	 */
	findChildInSectionLead( sectionIndex, selector ) {
		let $heading, $nextHeading;

		const headingSelector = HEADING_SELECTOR;

		// Adds the descendants matching the selector to the given matching nodes.
		function withNestedChildren( $matchingNodes ) {
			return $matchingNodes.find( selector ).addBack();
		}

		if ( sectionIndex === 0 ) {
			// lead is easy
			const $lead = this.getLeadSectionElement();
			if ( $lead && $lead.length ) {

				// Handle nested sections in Parsoid wikitext parser opt-in scenario.
				const $nestedSection = $lead.find( 'section[data-mw-section-id="0"]' );
				if ( $nestedSection.length ) {
					return withNestedChildren( $nestedSection.children( selector ) );
				}

				return withNestedChildren( $lead.children( selector ) );
			} else {
				// No wrapped lead section: the lead is whatever precedes the first heading.
				$heading = this.findSectionHeadingByIndex( 1 );
				return $heading.length ? withNestedChildren( $heading.prevAll( selector ) ) :
					// this page is a stub so search entire page
					this.$el.find( selector );
			}
		}

		// find heading associated with the section by looking at its
		// index position in the article
		// section ids relate to the element position in the page and the first heading
		// lead has been dealt with above, so first heading corresponds to section 1,
		// the first heading in the article.
		$heading = this.findSectionHeadingByIndex( sectionIndex );

		// If section-heading is present on the heading,
		// then we know the page has been MobileFormatted
		// and that this is a wrapped section
		if ( $heading.hasClass( 'section-heading' ) ) {
			// get content of section
			const $el = $heading.next();
			// inside section find the first heading
			$nextHeading = $el.find( headingSelector ).eq( 0 );
			return $nextHeading.length ?
				// find all matching elements before the next heading
				withNestedChildren( $nextHeading.prevAll( selector ) ) :
				// There are no subheadings inside
				// Grab all matching elements in section
				withNestedChildren( $el.children( selector ) );
		} else {
			// the heading relates to a subsection (or unwrapped desktop section),
			// so grab elements between this and the next one
			$nextHeading = $heading.eq( 0 ).nextAll( headingSelector ).eq( 0 );
			return $heading.nextUntil( $nextHeading, selector );
		}
	}

	/**
	 * Get the lead section of the page view.
	 *
	 * @return {jQuery.Object|null} the `.mf-section-0` element(s) inside the
	 *  container, or null when there are none.
	 */
	getLeadSectionElement() {
		/*
		 * The page is formatted as follows:
		 * <div id="bodyContent">
		 *   <!-- content of the page.. -->
		 *   <div id="mw-content-text">
		 *     <div class="mf-section-0">lead section</div>
		 *     <h2></h2>
		 *     <div class="mf-section-1">second section</div>
		 *   </div>
		 * </div>
		 */
		const $leadSection = this.$el.find( '.mf-section-0' );

		if ( $leadSection.length ) {
			return $leadSection;
		}
		// no lead section found
		return null;
	}

	/**
	 * Returns a Thumbnail object from an anchor element containing an image or
	 * null if not valid.
	 *
	 * The anchor is not valid when it has no href, when the href cannot be
	 * parsed as a URL, when neither a `title` query parameter nor a trailing
	 * path segment can be read from it, or when the anchor, one of its
	 * ancestors/descendants or its lazy-image placeholder carries one of the
	 * EXCLUDE_THUMBNAIL_CLASS_SELECTORS classes.
	 *
	 * @param {jQuery} $a Anchor element that contains the image.
	 * @return {Thumbnail|null}
	 */
	getThumbnail( $a ) {
		const notSelector = '.' + EXCLUDE_THUMBNAIL_CLASS_SELECTORS.join( ',.' ),
			$lazyImage = $a.find( '.lazy-image-placeholder' ),
			href = $a.attr( 'href' );

		let url = null;
		if ( href ) {
			try {
				url = new URL( href, location.href );
			} catch ( e ) {
				// The URL constructor throws on a malformed href. Such a link
				// cannot point at a file, so treat it as "not valid" rather
				// than aborting the caller's iteration over all thumbnails.
				return null;
			}
		}

		// Normalise "no URL" to null so a missing href is never mistaken for a
		// present (but undefined) legacy title.
		const legacyTitle = url ? url.searchParams.get( 'title' ) : null,
			match = url ? url.pathname.match( /[^/]+$/ ) : null;

		// Parents need to be checked as well.
		let valid = $a.parents( notSelector ).length === 0 &&
			$a.find( notSelector ).length === 0;

		// filter out invalid lazy loaded images if so far image is valid
		if ( $lazyImage.length && valid ) {
			// if the regex matches it means the image has one of the classes
			// thus we must invert the result
			valid = !new RegExp( '\\b(' + EXCLUDE_THUMBNAIL_CLASS_SELECTORS.join( '|' ) + ')\\b' )
				.test( $lazyImage.data( 'class' ) );
		}

		if ( valid && ( legacyTitle !== null || match ) ) {
			return new Thumbnail( {
				el: $a,
				// The `title` query parameter (index.php style link) wins over
				// the last path segment.
				filename: mw.util.percentDecodeFragment(
					legacyTitle !== null ? legacyTitle : match[ 0 ]
				)
			} );
		}

		return null;
	}

	/**
	 * Return all the thumbnails in the article.
	 * Images which have a class or link container (.image|.thumbimage)
	 * that matches one of the items of the constant EXCLUDE_THUMBNAIL_CLASS_SELECTORS
	 * will be excluded.
	 * A thumbnail nested inside an element with one of these classes is excluded as well.
	 * e.g. `<div class="noviewer"><a class="image"><img></a></div>` is not a valid thumbnail
	 * `<a class="image noviewer"><img></a>` is not a valid thumbnail
	 * `<a class="image"><img class="noviewer"></a>` is not a valid thumbnail
	 *
	 * @param {jQuery} [$el] Container to search, defaults to this.$el.
	 * @return {Thumbnail[]}
	 */
	getThumbnails( $el ) {
		const thumbs = [];
		// Keep the parameter untouched; fall back to the parser's own container.
		const $container = $el || this.$el;

		const $thumbs = $container.find( THUMB_SELECTOR );

		$thumbs.each( ( i, thumbEl ) => {
			const $a = $container.find( thumbEl );
			const thumb = this.getThumbnail( $a );

			if ( thumb ) {
				thumbs.push( thumb );
			}
		} );
		return thumbs;
	}

	/**
	 * Returns a jQuery object representing all redlinks on the page.
	 *
	 * @return {jQuery.Object} all elements with class `new` inside the container.
	 */
	getRedLinks() {
		return this.$el.find( '.new' );
	}

	/**
	 * Returns an object consistent with MediaWiki API representing languages
	 * associated with the page in the user's current language.
	 *
	 * Note that the links are read from the whole document
	 * (`#p-lang` and `#p-variants`), not from the parser's container.
	 *
	 * @param {string} pageTitle to fallback to if none found
	 * @return {Object} containing langlinks
	 *   and variant links as defined @ https://en.m.wikipedia.org/w/api.php?action=help&modules=query%2Blanglinks
	 */
	getLanguages( pageTitle ) {
		const mapLinkToLanguageObj = ( node ) => {
			// Name of language (e.g. עברית for Hebrew)
			const autonym = node.textContent;
			// The name of the language in the current language
			// e.g. for english this would be Hebrew
			const langname = node.getAttribute( 'data-language-local-name' ) || autonym;
			const title = node.getAttribute( 'data-title' ) || pageTitle;

			return {
				lang: node.getAttribute( 'hreflang' ),
				autonym,
				langname,
				title,
				url: node.getAttribute( 'href' )
			};
		};
		return {
			languages: Array.prototype.map.call(
				document.querySelectorAll( '#p-lang .interlanguage-link a' ),
				mapLinkToLanguageObj
			),
			variants: Array.prototype.map.call(
				document.querySelectorAll( '#p-variants li a' ),
				mapLinkToLanguageObj
			)
		};
	}
}
 
/**
 * Selector for matching headings.
 * Exposes the module-level HEADING_SELECTOR constant as a static member of
 * the class.
 */
PageHTMLParser.HEADING_SELECTOR = HEADING_SELECTOR;

/**
 * Selector for thumbnails.
 * Exposes the module-level THUMB_SELECTOR constant as a static member of
 * the class.
 */
PageHTMLParser.THUMB_SELECTOR = THUMB_SELECTOR;

// The class is the module's only export.
module.exports = PageHTMLParser;