All files unicodejs.js

100% Statements 74/74
100% Branches 44/44
100% Functions 7/7
100% Lines 72/72

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222              1x                           1x 16472x                 1x 12547x                       8724x                           3618x 1002x   2616x 2616x 1244x   1372x                                                             762x 762x       762x 762x     762x 762x     762x 744x     18x         18x   18x     18x 8x   18x 16x   18x 16x   18x                           1x 64x 64x 64x   64x 4085x     4085x 1950x 1406x 2x   1404x 1404x     544x 1x     543x 543x     543x 543x         2135x 2135x 2135x 1x   2134x 1x     2133x 4x     2129x   1367x 762x   5x 5x     757x       2129x 1123x 1123x 1123x 1123x         55x 13x 42x 34x   55x      
/*!
 * UnicodeJS namespace
 *
 * @copyright 2013-2018 UnicodeJS team and others; see AUTHORS.txt
 * @license The MIT License (MIT); see LICENSE.txt
 */
 
( function () {
 
	/**
	 * Namespace for all UnicodeJS classes, static methods and static properties.
	 *
	 * @namespace unicodeJS
	 */
 
	/**
	 * Check if a code unit is the leading half of a surrogate pair
	 *
	 * @param {string} unit Code unit
	 * @return {boolean}
	 */
	unicodeJS.isLeadingSurrogate = function ( unit ) {
		// Coerce to a strict boolean, as the documented return type promises.
		// Without the coercion, String#match leaks its match array (or null), and
		// the `unit &&` guard hands falsy inputs such as '' or undefined straight
		// back to the caller.
		// The pattern is anchored, so only a string that is exactly one code unit
		// in the high-surrogate range 0xD800-0xDBFF matches.
		return !!( unit && unit.match( /^[\uD800-\uDBFF]$/ ) );
	};
 
	/**
	 * Check if a code unit is the trailing half of a surrogate pair
	 *
	 * @param {string} unit Code unit
	 * @return {boolean}
	 */
	unicodeJS.isTrailingSurrogate = function ( unit ) {
		// Coerce to a strict boolean, as the documented return type promises.
		// Without the coercion, String#match leaks its match array (or null), and
		// the `unit &&` guard hands falsy inputs such as '' or undefined straight
		// back to the caller.
		// The pattern is anchored, so only a string that is exactly one code unit
		// in the low-surrogate range 0xDC00-0xDFFF matches.
		return !!( unit && unit.match( /^[\uDC00-\uDFFF]$/ ) );
	};
 
	/**
	 * Write a UTF-16 code unit as a javascript string literal.
	 *
	 * @memberof unicodeJS
	 * @private
	 * @param {number} codeUnit integer between 0x0000 and 0xFFFF
	 * @return {string} String literal ('\u' followed by 4 hex digits)
	 */
	function uEsc( codeUnit ) {
		// Adding 0x10000 guarantees a fifth hex digit for any value in
		// 0x0000-0xFFFF, so the last four digits come out zero-padded.
		var widened = ( codeUnit + 0x10000 ).toString( 16 );
		var fourDigits = widened.slice( -4 );

		return [ '\\u', fourDigits ].join( '' );
	}
 
	/**
	 * Return a regexp string for the code unit range min-max
	 *
	 * @memberof unicodeJS
	 * @private
	 * @param {number} min the minimum code unit in the range.
	 * @param {number} max the maximum code unit in the range.
	 * @param {boolean} [bracket] If true, then wrap range in [ ... ]
	 * @return {string} Regexp string which matches the range
	 */
	function codeUnitRange( min, max, bracket ) {
		var lower = uEsc( min );

		// A degenerate range is just the single escaped code unit; it is never
		// bracketed, whatever the `bracket` flag says.
		if ( max === min ) {
			return lower;
		}

		var span = lower + '-' + uEsc( max );

		// A bare "a-b" is only meaningful inside a character class, so callers
		// that use the range on its own ask for the surrounding [ ... ].
		return bracket ? '[' + span + ']' : span;
	}
 
	/**
	 * Get a list of boxes in hi-lo surrogate space, corresponding to the given character range
	 *
	 * A box {hi: [x, y], lo: [z, w]} represents a regex [x-y][z-w] to match a surrogate pair
	 *
	 * Suppose ch1 and ch2 have surrogate pairs (hi1, lo1) and (hi2, lo2).
	 * Then the range of chars from ch1 to ch2 can be represented as the
	 * disjunction of three code unit ranges:
	 *
	 *     [hi1 - hi1][lo1 - 0xDFFF]
	 *      |
	 *     [hi1+1 - hi2-1][0xDC00 - 0xDFFF]
	 *      |
	 *     [hi2 - hi2][0xDC00 - lo2]
	 *
	 * Often the notation can be optimised (e.g. when hi1 == hi2).
	 *
	 * @memberof unicodeJS
	 * @private
	 * @param {number} ch1 The min character of the range; must be over 0xFFFF
	 * @param {number} ch2 The max character of the range; must be at least ch1
	 * @return {Array.<Object>} A list of boxes where each box is an object with two properties: 'hi' and 'lo'.
	 *  'hi' is an array of two numbers representing the range of the high surrogate.
	 *  'lo' is an array of two numbers representing the range of the low surrogate.
	 */
	function getCodeUnitBoxes( ch1, ch2 ) {
		var HIGH_BASE = 0xD800,
			LOW_FIRST = 0xDC00,
			LOW_LAST = 0xDFFF,
			// positions of the two endpoints relative to 0x10000
			offset1 = ch1 - 0x10000,
			offset2 = ch2 - 0x10000;

		/* eslint-disable no-bitwise */
		// Split each endpoint into its surrogate pair: the bits above the low 10
		// select the high surrogate, the low 10 bits select the low surrogate.
		var firstHigh = HIGH_BASE + ( offset1 >> 10 ),
			firstLow = LOW_FIRST + ( offset1 & 0x3FF ),
			lastHigh = HIGH_BASE + ( offset2 >> 10 ),
			lastLow = LOW_FIRST + ( offset2 & 0x3FF ),
			// smallest high surrogate whose whole block lies at or above ch1
			fullFrom = HIGH_BASE + ( ( offset1 + 0x3FF ) >> 10 ),
			// largest high surrogate whose whole block lies at or below ch2
			fullTo = HIGH_BASE + ( ( offset2 - 0x3FF ) >> 10 );
		/* eslint-enable no-bitwise */

		// Both endpoints share one high surrogate: a single box covers the range
		if ( firstHigh === lastHigh ) {
			return [ { hi: [ firstHigh, lastHigh ], lo: [ firstLow, lastLow ] } ];
		}

		var result = [];

		// Partial block at the start: ch1 is not the first character of its block
		if ( fullFrom > firstHigh ) {
			result.push( { hi: [ firstHigh, firstHigh ], lo: [ firstLow, LOW_LAST ] } );
		}
		// Run of high surrogates whose blocks are covered completely
		if ( fullTo >= fullFrom ) {
			result.push( { hi: [ fullFrom, fullTo ], lo: [ LOW_FIRST, LOW_LAST ] } );
		}
		// Partial block at the end: ch2 is not the last character of its block
		if ( lastHigh > fullTo ) {
			result.push( { hi: [ lastHigh, lastHigh ], lo: [ LOW_FIRST, lastLow ] } );
		}

		return result;
	}
 
	/**
	 * Make a regexp string for an array of Unicode character ranges.
	 *
	 * If either character in a range is above 0xFFFF, then the range will
	 * be encoded as multiple surrogate pair ranges. It is an error for a
	 * range to overlap with the surrogate range 0xD800-0xDFFF (as this would
	 * only match ill-formed strings).
	 *
	 * @param {Array} ranges Array of ranges, each of which is a single character (a number) or an interval (a [min, max] array of numbers)
	 * @return {string} Regexp string for the disjunction of the ranges.
	 */
	unicodeJS.charRangeArrayRegexp = function ( ranges ) {
		// Builds one alternation: an optional BMP character class first, followed
		// by one alternative per surrogate pair or surrogate-pair box.
		// Throws an Error for a lone surrogate, a value above 0x10FFFF, an
		// interval with min > max, or an interval overlapping 0xD800-0xDFFF.
		var characterClass = [], // list of (\uXXXX code unit or interval), for BMP
			disjunction = []; // list of regex strings, to be joined with '|'

		for ( var i = 0; i < ranges.length; i++ ) {
			var range = ranges[ i ];

			// Handle a single character given as a bare number
			if ( typeof range === 'number' ) {
				if ( range <= 0xFFFF ) {
					// BMP character: one code unit, collected into the character class
					if ( range >= 0xD800 && range <= 0xDFFF ) {
						throw new Error( 'Surrogate: ' + range.toString( 16 ) );
					}
					characterClass.push( uEsc( range ) );
					continue;
				} else {
					// Handle single surrogate pair
					if ( range > 0x10FFFF ) {
						throw new Error( 'Character code too high: ' + range.toString( 16 ) );
					}
					/* eslint-disable no-bitwise */
					var hi = 0xD800 + ( ( range - 0x10000 ) >> 10 );
					var lo = 0xDC00 + ( ( range - 0x10000 ) & 0x3FF );
					/* eslint-enable no-bitwise */

					disjunction.push( uEsc( hi ) + uEsc( lo ) );
					continue;
				}
			}

			// Handle interval
			var min = range[ 0 ];
			var max = range[ 1 ];
			if ( min > max ) {
				throw new Error( min.toString( 16 ) + ' > ' + max.toString( 16 ) );
			}
			if ( max > 0x10FFFF ) {
				throw new Error( 'Character code too high: ' +
					max.toString( 16 ) );
			}
			if ( max >= 0xD800 && min <= 0xDFFF ) {
				throw new Error( 'range includes surrogates: ' +
					min.toString( 16 ) + '-' + max.toString( 16 ) );
			}

			// Start every interval with no boxes. If the list were shared across
			// iterations, an all-BMP interval following a non-BMP one would append
			// the previous interval's surrogate boxes to the disjunction a second
			// time.
			var boxes = [];
			if ( max <= 0xFFFF ) {
				// interval is entirely BMP
				characterClass.push( codeUnitRange( min, max ) );
			} else if ( min <= 0xFFFF ) {
				// interval is BMP and non-BMP: split it at the plane boundary
				characterClass.push( codeUnitRange( min, 0xFFFF ) );
				boxes = getCodeUnitBoxes( 0x10000, max );
			} else {
				// interval is entirely non-BMP
				boxes = getCodeUnitBoxes( min, max );
			}

			// append hi-lo surrogate space boxes as code unit range pairs
			for ( var j = 0; j < boxes.length; j++ ) {
				var box = boxes[ j ];
				var hi2 = codeUnitRange( box.hi[ 0 ], box.hi[ 1 ], true );
				var lo2 = codeUnitRange( box.lo[ 0 ], box.lo[ 1 ], true );
				disjunction.push( hi2 + lo2 );
			}
		}

		// prepend BMP character class to the disjunction
		if ( characterClass.length === 1 && !characterClass[ 0 ].match( /-/ ) ) {
			// single character: the escapes never contain a literal '-', so a
			// hyphen here can only be the range separator
			disjunction.unshift( characterClass[ 0 ] );
		} else if ( characterClass.length > 0 ) {
			disjunction.unshift( '[' + characterClass.join( '' ) + ']' );
		}
		return disjunction.join( '|' );
	};
}() );