Source: mobile.startup/PageHTMLParser.js

const
	Thumbnail = require( './Thumbnail' ),
	// Comma-separated list of heading tags, taken from site config with h1-h5 as the fallback.
	HEADING_SELECTOR = mw.config.get( 'wgMFMobileFormatterHeadings', [ 'h1', 'h2', 'h3', 'h4', 'h5' ] ).join( ',' ),
	// Class names that mark an image (or its link/container) as not being a thumbnail.
	EXCLUDE_THUMBNAIL_CLASS_SELECTORS = [ 'noviewer', 'metadata' ],
	// One `:not(.class)` per excluded class, concatenated so they all apply to the same element.
	NOT_SELECTORS = EXCLUDE_THUMBNAIL_CLASS_SELECTORS.map( ( excludeSelector ) => `:not(.${excludeSelector})` ).join( '' ),
	// Each anchor selector must be its own array entry: an entry containing a comma
	// would only get the `:not(...)` suffix on its last alternative, letting excluded
	// anchors of the earlier alternatives slip through.
	THUMB_SELECTOR = [ 'a.image', 'a.thumbimage', 'a.mw-file-description' ].map(
		( selector ) => `${selector}${NOT_SELECTORS}`
	).join( ',' );

/**
 * Helper for locating headings, section content, thumbnails, red links and
 * language links inside rendered page HTML.
 */
class PageHTMLParser {
	/**
	 * @param {jQuery.Object} $container Used when parsing to find children within
	 * this container
	 */
	constructor( $container ) {
		this.$el = $container;

		// T220751: Cache headings as $el.find is a very expensive call.
		/** @private */
		this.$headings = this.$el.find( HEADING_SELECTOR );
	}

	/**
	 * Find the heading in the page.
	 * This has the benefit of excluding any additional h2s and h3s that may
	 * have been added programmatically.
	 *
	 * @param {number} sectionIndex as defined by the PHP parser.
	 *  It should correspond to the section id
	 *  used in the edit link for the section.
	 *  Note, confusingly, this is different from section "ID" which is
	 *  used in methods
	 * @return {jQuery.Object} the matching heading, or an empty set when
	 *  sectionIndex is lower than 1 or no such heading exists.
	 */
	findSectionHeadingByIndex( sectionIndex ) {
		if ( sectionIndex < 1 ) {
			// negative indexes will search from the end, which is behaviour we do not want.
			// return an empty set when this happens.
			// eslint-disable-next-line no-undef
			return $( [] );
		}
		return this.$headings
			// Headings must strictly be a child element of a section element
			// or the parser-output.
			// Not an ancestor!
			.filter( '.mw-parser-output > *, [class^="mf-section-"] > *' )
			// Section indexes are 1-based, jQuery sets are 0-based.
			.eq( sectionIndex - 1 );
	}

	/**
	 * Finds all child elements that match the selector in a given section or subsection.
	 * Returns any direct child elements that match the selector,
	 * (i.e. searches only one level deep)
	 * as well as any elements that match the selector within those children.
	 * If the Page has no headings (e.g. a stub),
	 * then the search will target all nodes within the page.
	 *
	 * This code should work on desktop (PHP parser HTML)
	 * as well as mobile formatted HTML (PHP parser + MobileFormatter)
	 *
	 * @param {number} sectionIndex as defined by the PHP parser. It should correspond to
	 *  the section id used in the edit link for the section.
	 *  Note, confusingly, this is different from section "ID" which is
	 *  used in methods
	 * @param {string} selector to match
	 * @return {jQuery.Object}
	 */
	findChildInSectionLead( sectionIndex, selector ) {
		// Extends a set of matched nodes with any of their descendants that also match.
		const withNestedChildren = ( $matchingNodes ) => $matchingNodes.find( selector ).addBack();

		if ( sectionIndex === 0 ) {
			// lead is easy
			const $lead = this.getLeadSectionElement();
			if ( $lead && $lead.length ) {
				return withNestedChildren( $lead.children( selector ) );
			}
			// No wrapped lead section: everything before the first heading is the lead.
			const $firstHeading = this.findSectionHeadingByIndex( 1 );
			return $firstHeading.length ?
				withNestedChildren( $firstHeading.prevAll( selector ) ) :
				// this page is a stub so search entire page
				this.$el.find( selector );
		}

		// find heading associated with the section by looking at its
		// index position in the article
		// section ids relate to the element position in the page and the first heading
		// lead has been dealt with above, so first heading corresponds to section 1,
		// the first heading in the article.
		const $heading = this.findSectionHeadingByIndex( sectionIndex );

		// If section-heading is present on the heading,
		// then we know the page has been MobileFormatted
		// and that this is a wrapped section
		if ( $heading.hasClass( 'section-heading' ) ) {
			// get content of section
			const $content = $heading.next();
			// inside section find the first heading
			const $firstSubheading = $content.find( HEADING_SELECTOR ).eq( 0 );
			return $firstSubheading.length ?
				// find all matches before the first subheading
				withNestedChildren( $firstSubheading.prevAll( selector ) ) :
				// There are no subheadings inside, so search the whole section
				withNestedChildren( $content.children( selector ) );
		}

		// the heading relates to a subsection (or unwrapped desktop section),
		// so grab elements between this and the next one.
		// Note: only matching siblings are returned here; matches nested inside
		// those siblings are not added in this branch.
		const $nextHeading = $heading.eq( 0 ).nextAll( HEADING_SELECTOR ).eq( 0 );
		return $heading.nextUntil( $nextHeading, selector );
	}

	/**
	 * Get the lead section of the page view.
	 *
	 * @return {jQuery.Object|null} the `.mf-section-0` element(s), or null when
	 *  the container has none.
	 */
	getLeadSectionElement() {
		/*
		 * The page is formatted as follows:
		 * <div id="bodyContent">
		 *   <!-- content of the page.. -->
		 *   <div id="mw-content-text">
		 *     <div class="mf-section-0">lead section</div>
		 *     <h2></h2>
		 *     <div class="mf-section-1">second section</div>
		 *   </div>
		 * </div>
		 */
		const $leadSection = this.$el.find( '.mf-section-0' );

		// no lead section found => null rather than an empty set
		return $leadSection.length ? $leadSection : null;
	}

	/**
	 * Returns a Thumbnail object from an anchor element containing an image or
	 * null if not valid.
	 *
	 * An anchor is invalid when it, one of its ancestors or one of its
	 * descendants carries a class listed in EXCLUDE_THUMBNAIL_CLASS_SELECTORS,
	 * when its lazy-image placeholder records such a class, or when no file
	 * name can be extracted from its href.
	 *
	 * @param {jQuery} $a Anchor element that contains the image.
	 * @return {Thumbnail|null}
	 */
	getThumbnail( $a ) {
		// e.g. ".noviewer,.metadata"
		const notSelector = EXCLUDE_THUMBNAIL_CLASS_SELECTORS
			.map( ( className ) => `.${className}` )
			.join( ',' );
		const $lazyImage = $a.find( '.lazy-image-placeholder' );
		const href = $a.attr( 'href' );
		// Legacy URLs carry the file name in a `title=` query parameter,
		// otherwise it is the last path segment.
		const legacyMatch = href && href.match( /title=([^/&]+)/ );
		const match = href && href.match( /[^/]+$/ );

		// The anchor itself, its parents and its children all need to be checked.
		let valid = !$a.is( notSelector ) &&
			$a.parents( notSelector ).length === 0 &&
			$a.find( notSelector ).length === 0;

		// filter out invalid lazy loaded images if so far image is valid
		if ( valid && $lazyImage.length ) {
			// The placeholder stores the class of the image it stands in for
			// in its "class" data; if the regex matches it means the image has
			// one of the excluded classes thus we must invert the result
			valid = !new RegExp( '\\b(' + EXCLUDE_THUMBNAIL_CLASS_SELECTORS.join( '|' ) + ')\\b' )
				.test( $lazyImage.data( 'class' ) );
		}

		if ( valid && ( legacyMatch || match ) ) {
			return new Thumbnail( {
				el: $a,
				filename: mw.util.percentDecodeFragment(
					legacyMatch ? legacyMatch[1] : match[0]
				)
			} );
		}

		return null;
	}

	/**
	 * Return all the thumbnails in the article.
	 * Images which have a class or link container (.image|.thumbimage)
	 * that matches one of the items of the constant EXCLUDE_THUMBNAIL_CLASS_SELECTORS
	 * will be excluded.
	 * A thumbnail nested inside an element with one of these classes is excluded as well.
	 * e.g. `<div class="noviewer"><a class="image"><img></a></div>` is not a valid thumbnail
	 * `<a class="image noviewer"><img></a>` is not a valid thumbnail
	 * `<a class="image"><img class="noviewer"></a>` is not a valid thumbnail
	 *
	 * @param {jQuery} [$el] Container to search, defaults to this.$el.
	 * @return {Thumbnail[]}
	 */
	getThumbnails( $el ) {
		const $container = $el || this.$el;
		const $thumbs = $container.find( THUMB_SELECTOR );
		const thumbs = [];

		$thumbs.each( ( index ) => {
			// .eq() wraps the anchor that was already matched, so no second
			// DOM query (and no global jQuery reference) is needed.
			const thumb = this.getThumbnail( $thumbs.eq( index ) );

			if ( thumb ) {
				thumbs.push( thumb );
			}
		} );
		return thumbs;
	}

	/**
	 * Returns a jQuery object representing all redlinks on the page.
	 *
	 * @return {jQuery.Object} every element with the `new` class inside the container.
	 */
	getRedLinks() {
		return this.$el.find( '.new' );
	}

	/**
	 * Returns an object consistent with MediaWiki API representing languages
	 * associated with the page in the user's current language.
	 *
	 * The links are read from the document (`#p-lang` and `#p-variants`),
	 * not from the container passed to the constructor.
	 *
	 * @param {string} pageTitle to fallback to if none found
	 * @return {Object} containing langlinks
	 *   and variant links as defined @ https://en.m.wikipedia.org/w/api.php?action=help&modules=query%2Blanglinks
	 */
	getLanguages( pageTitle ) {
		// Separates the page title from the language name in a link's title attribute.
		const DELIMITER = ' – ';

		const mapLinkToLanguageObj = ( node ) => {
			// Name of language (e.g. עברית for Hebrew)
			const autonym = node.textContent;
			// The name of the language in the current language
			// e.g. for english this would be Hebrew
			let langname;
			let title = node.getAttribute( 'title' ) || pageTitle;
			if ( title.indexOf( DELIMITER ) > -1 ) {
				// The last delimited part is the language name; anything
				// before it (delimiters included) is the page title.
				const parts = title.split( DELIMITER );
				langname = parts.pop();
				title = parts.join( DELIMITER );
			}
			return {
				lang: node.getAttribute( 'hreflang' ),
				autonym,
				// Fall back to the autonym when no language name was found.
				langname: langname || autonym,
				title,
				url: node.getAttribute( 'href' )
			};
		};

		return {
			languages: Array.from(
				document.querySelectorAll( '#p-lang .interlanguage-link a' ),
				mapLinkToLanguageObj
			),
			variants: Array.from(
				document.querySelectorAll( '#p-variants li a' ),
				mapLinkToLanguageObj
			)
		};
	}
}

/**
 * Selector for matching headings.
 * A comma-separated list of heading tag names, exposed as a static so that
 * other modules can reuse the exact selector this parser uses.
 *
 * @memberof PageHTMLParser
 */
PageHTMLParser.HEADING_SELECTOR = HEADING_SELECTOR;

/**
 * Selector for thumbnails.
 * Matches the link anchors that wrap thumbnail images; this is the selector
 * used by getThumbnails() to collect candidate anchors.
 *
 * @memberof PageHTMLParser
 */
PageHTMLParser.THUMB_SELECTOR = THUMB_SELECTOR;

// The class (with its static selectors attached above) is the module's only export.
module.exports = PageHTMLParser;