All files unicodejs.wordbreak.js

100% Statements 95/95
100% Branches 140/140
100% Functions 6/6
100% Lines 95/95
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322  
 
 
 
 
 
 
 
 
 
 
 
1x
1x
1x
 
 
 
1x
1x
1x
 
 
 
1x
 
18x
 
 
 
1x
 
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13287x
137703x
13086x
 
 
201x
 
 
 
 
 
 
 
 
 
 
 
1x
17x
 
 
 
 
 
 
 
 
 
 
 
1x
17x
 
 
 
 
 
 
 
 
 
 
 
 
1x
 
34x
34x
 
 
 
34x
152x
152x
 
64x
50x
50x
 
 
 
34x
 
 
30x
 
 
34x
 
 
 
 
 
 
 
 
 
 
 
 
1x
8301x
8301x
8301x
8301x
 
 
 
 
 
 
 
8301x
8301x
 
 
 
 
8301x
3650x
 
 
 
4651x
150x
 
 
 
4501x
4501x
4501x
4501x
 
4501x
 
 
 
2x
 
 
 
 
 
 
341x
 
 
 
9x
 
 
 
10x
 
 
 
 
 
4139x
 
 
1286x
 
 
2853x
950x
 
137x
 
813x
 
 
 
 
 
 
813x
813x
 
 
 
 
2716x
 
 
 
109x
 
 
 
 
2607x
3164x
3164x
1356x
1356x
 
1808x
1808x
 
2607x
 
 
2607x
2937x
2937x
1356x
1356x
 
1581x
1581x
 
2607x
 
2607x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156x
 
 
 
 
 
 
 
 
66x
 
 
 
 
7x
 
 
 
 
 
 
 
 
190x
 
 
 
 
 
2188x
 
13x
13x
 
13x
37x
37x
4x
 
33x
33x
33x
20x
 
 
13x
8x
 
 
 
 
 
2180x
 
 
  /*!
 * UnicodeJS Word Break module
 *
 * Implementation of Unicode 15.0.0 Default Word Boundary Specification
 * http://www.unicode.org/reports/tr29/#Default_Word_Boundaries
 *
 * @copyright 2013-2018 UnicodeJS team and others; see AUTHORS.txt
 * @license The MIT License (MIT); see LICENSE.txt
 */
 
/* eslint-disable no-fallthrough */
 
( function () {
	var properties = unicodeJS.wordbreakproperties,
		emojiProperties = unicodeJS.emojiproperties,
		/**
		 * @namespace unicodeJS.wordbreak
		 */
		wordbreak = unicodeJS.wordbreak = {},
		patterns = {},
		ZWJ_FE = /^(Format|Extend|ZWJ)$/;
 
	var property;
	// build regexes
	for ( property in properties ) {
		// eslint-disable-next-line security/detect-non-literal-regexp
		patterns[ property ] = new RegExp(
			unicodeJS.charRangeArrayRegexp( properties[ property ] )
		);
	}
	for ( property in emojiProperties ) {
		// eslint-disable-next-line security/detect-non-literal-regexp
		patterns[ property ] = new RegExp(
			unicodeJS.charRangeArrayRegexp( emojiProperties[ property ] )
		);
	}
 
	/**
	 * Return the wordbreak property value for the codepoint
	 *
	 * See http://www.unicode.org/reports/tr29/#Word_Boundaries
	 *
	 * @memberof unicodeJS.wordbreak
	 * @private
	 * @param {string} codepoint The codepoint
	 * @return {string|null} The unicode wordbreak property value (key of unicodeJS.wordbreakproperties)
	 */
	function getProperty( codepoint ) {
		for ( property in patterns ) {
			if ( patterns[ property ].test( codepoint ) ) {
				return property;
			}
		}
		return null;
	}
 
	/**
	 * Find the next word break offset.
	 *
	 * @memberof unicodeJS.wordbreak
	 * @param {unicodeJS.TextString} string TextString
	 * @param {number} pos Character position
	 * @param {boolean} [onlyAlphaNumeric=false] When set, ignores a break if the previous character is not alphaNumeric
	 * @return {number} Returns the next offset which is a word break
	 */
	wordbreak.nextBreakOffset = function ( string, pos, onlyAlphaNumeric ) {
		return this.moveBreakOffset( 1, string, pos, onlyAlphaNumeric );
	};
 
	/**
	 * Find the previous word break offset.
	 *
	 * @memberof unicodeJS.wordbreak
	 * @param {unicodeJS.TextString} string TextString
	 * @param {number} pos Character position
	 * @param {boolean} [onlyAlphaNumeric=false] When set, ignores a break if the previous character is not alphaNumeric
	 * @return {number} Returns the previous offset which is a word break
	 */
	wordbreak.prevBreakOffset = function ( string, pos, onlyAlphaNumeric ) {
		return this.moveBreakOffset( -1, string, pos, onlyAlphaNumeric );
	};
 
	/**
	 * Find the next word break offset in a specified direction.
	 *
	 * @memberof unicodeJS.wordbreak
	 * @param {number} direction Direction to search in, should be plus or minus one
	 * @param {unicodeJS.TextString} string TextString
	 * @param {number} pos Character position
	 * @param {boolean} [onlyAlphaNumeric=false] When set, ignores a break if the previous character is not alphaNumeric
	 * @return {number} Returns the previous offset which is word break
	 */
	wordbreak.moveBreakOffset = function ( direction, string, pos, onlyAlphaNumeric ) {
		// when moving backwards, use the character to the left of the cursor
		var nextCodepoint = direction > 0 ? string.nextCodepoint.bind( string ) : string.prevCodepoint.bind( string ),
			prevCodepoint = direction > 0 ? string.prevCodepoint.bind( string ) : string.nextCodepoint.bind( string );
 
		var codepoint;
		// Search for the next break point
		while ( ( codepoint = nextCodepoint( pos ) ) !== null ) {
			pos += codepoint.length * direction;
			if ( this.isBreak( string, pos ) ) {
				// Check previous character was alpha-numeric if required
				if ( onlyAlphaNumeric ) {
					var lastProperty = getProperty( prevCodepoint( pos ) );
					if ( lastProperty !== 'ALetter' &&
						lastProperty !== 'Numeric' &&
						lastProperty !== 'Katakana' &&
						lastProperty !== 'HebrewLetter' ) {
						continue;
					}
				}
				break;
			}
		}
		return pos;
	};
 
	/**
	 * Evaluates whether a position within some text is a word boundary.
	 *
	 * The text object elements may be codepoints or code units
	 *
	 * @memberof unicodeJS.wordbreak
	 * @param {unicodeJS.TextString} string TextString
	 * @param {number} pos Character position
	 * @return {boolean} Is the position a word boundary
	 */
	wordbreak.isBreak = function ( string, pos ) {
		var lft = [],
			rgt = [],
			l = 0,
			r = 0;
 
		// Table 3a. Word_Break Rule Macros
		// Macro        Represents
		// AHLetter     (ALetter | Hebrew_Letter)
		// MidNumLetQ   (MidNumLet | Single_Quote)
 
		// Get some context
		var nextCodepoint = string.nextCodepoint( pos + r );
		var prevCodepoint = string.prevCodepoint( pos - l );
 
		// Break at the start and end of text, unless the text is empty.
		// WB1: sot ÷ Any
		// WB2: Any ÷ eot
		if ( nextCodepoint === null || prevCodepoint === null ) {
			return true;
		}
 
		// Do not break inside surrogate pair
		if ( string.isMidSurrogate( pos ) ) {
			return false;
		}
 
		// Store context
		rgt.push( getProperty( nextCodepoint ) );
		lft.push( getProperty( prevCodepoint ) );
		r += nextCodepoint.length;
		l += prevCodepoint.length;
 
		switch ( true ) {
			// Do not break within CRLF.
			// WB3: CR × LF
			case lft[ 0 ] === 'CR' && rgt[ 0 ] === 'LF':
				return false;
 
			// Otherwise break before and after Newlines (including CR and LF)
			// WB3a: (Newline | CR | LF) ÷
			case lft[ 0 ] === 'Newline' || lft[ 0 ] === 'CR' || lft[ 0 ] === 'LF':
			// WB3b: ÷ (Newline | CR | LF)
			case rgt[ 0 ] === 'Newline' || rgt[ 0 ] === 'CR' || rgt[ 0 ] === 'LF':
				return true;
			// Do not break within emoji zwj sequences.
			// WB3c: ZWJ × \p{Extended_Pictographic}
			case lft[ 0 ] === 'ZWJ' && rgt[ 0 ] === 'ExtendedPictographic':
				return false;
			// Do not break within emoji zwj sequences.
			// WB3d: Keep horizontal whitespace together.
			case lft[ 0 ] === 'WSegSpace' && rgt[ 0 ] === 'WSegSpace':
				return false;
		}
 
		// Ignore Format and Extend characters, except after sot, CR, LF, and Newline.
		// (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend | ZWJ)
		// WB4: X (Extend | Format | ZWJ)* → X
		if ( rgt[ 0 ] && rgt[ 0 ].match( ZWJ_FE ) ) {
			// The Extend|Format character is to the right, so it is attached
			// to a character to the left, don't split here
			return false;
		}
		// We've reached the end of an ZWJ_FE sequence, collapse it
		while ( lft[ 0 ] && lft[ 0 ].match( ZWJ_FE ) ) {
			if ( pos - l <= 0 ) {
				// start of document
				return true;
			}
			prevCodepoint = string.prevCodepoint( pos - l );
			// TODO: This is not covered by tests, see T264904
			// istanbul ignore next
			if ( prevCodepoint === null ) {
				// start of document?
				return true;
			}
			lft[ 0 ] = getProperty( prevCodepoint );
			l += prevCodepoint.length;
		}
 
		// Do not break between most letters.
		// WB5: AHLetter × AHLetter
		if (
			( lft[ 0 ] === 'ALetter' || lft[ 0 ] === 'HebrewLetter' ) &&
			( rgt[ 0 ] === 'ALetter' || rgt[ 0 ] === 'HebrewLetter' )
		) {
			return false;
		}
 
		var nextProperty;
		// Some tests beyond this point require more context, as per WB4 ignore ZWJ_FE.
		do {
			nextCodepoint = string.nextCodepoint( pos + r );
			if ( nextCodepoint === null ) {
				nextProperty = null;
				break;
			}
			r += nextCodepoint.length;
			nextProperty = getProperty( nextCodepoint );
		} while ( nextProperty && nextProperty.match( ZWJ_FE ) );
		rgt.push( nextProperty );
 
		var prevProperty;
		do {
			prevCodepoint = string.prevCodepoint( pos - l );
			if ( prevCodepoint === null ) {
				prevProperty = null;
				break;
			}
			l += prevCodepoint.length;
			prevProperty = getProperty( prevCodepoint );
		} while ( prevProperty && prevProperty.match( ZWJ_FE ) );
		lft.push( prevProperty );
 
		switch ( true ) {
			// Do not break letters across certain punctuation.
			// WB6: AHLetter × (MidLetter | MidNumLetQ) AHLetter
			case ( lft[ 0 ] === 'ALetter' || lft[ 0 ] === 'HebrewLetter' ) &&
				( rgt[ 1 ] === 'ALetter' || rgt[ 1 ] === 'HebrewLetter' ) &&
				( rgt[ 0 ] === 'MidLetter' || rgt[ 0 ] === 'MidNumLet' || rgt[ 0 ] === 'SingleQuote' ):
			// WB7: AHLetter (MidLetter | MidNumLetQ) × AHLetter
			case ( rgt[ 0 ] === 'ALetter' || rgt[ 0 ] === 'HebrewLetter' ) &&
				( lft[ 1 ] === 'ALetter' || lft[ 1 ] === 'HebrewLetter' ) &&
				( lft[ 0 ] === 'MidLetter' || lft[ 0 ] === 'MidNumLet' || lft[ 0 ] === 'SingleQuote' ):
			// WB7a: Hebrew_Letter × Single_Quote
			case lft[ 0 ] === 'HebrewLetter' && rgt[ 0 ] === 'SingleQuote':
			// WB7b: Hebrew_Letter × Double_Quote Hebrew_Letter
			case lft[ 0 ] === 'HebrewLetter' && rgt[ 0 ] === 'DoubleQuote' && rgt[ 1 ] === 'HebrewLetter':
			// WB7c: Hebrew_Letter Double_Quote × Hebrew_Letter
			case lft[ 1 ] === 'HebrewLetter' && lft[ 0 ] === 'DoubleQuote' && rgt[ 0 ] === 'HebrewLetter':
 
			// Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”).
			// WB8: Numeric × Numeric
			case lft[ 0 ] === 'Numeric' && rgt[ 0 ] === 'Numeric':
			// WB9: AHLetter × Numeric
			case ( lft[ 0 ] === 'ALetter' || lft[ 0 ] === 'HebrewLetter' ) && rgt[ 0 ] === 'Numeric':
			// WB10: Numeric × AHLetter
			case lft[ 0 ] === 'Numeric' && ( rgt[ 0 ] === 'ALetter' || rgt[ 0 ] === 'HebrewLetter' ):
				return false;
 
			// Do not break within sequences, such as “3.2” or “3,456.789”.
			// WB11: Numeric (MidNum | MidNumLetQ) × Numeric
			case rgt[ 0 ] === 'Numeric' && lft[ 1 ] === 'Numeric' &&
				( lft[ 0 ] === 'MidNum' || lft[ 0 ] === 'MidNumLet' || lft[ 0 ] === 'SingleQuote' ):
			// WB12: Numeric × (MidNum | MidNumLetQ) Numeric
			case lft[ 0 ] === 'Numeric' && rgt[ 1 ] === 'Numeric' &&
				( rgt[ 0 ] === 'MidNum' || rgt[ 0 ] === 'MidNumLet' || rgt[ 0 ] === 'SingleQuote' ):
				return false;
 
			// Do not break between Katakana.
			// WB13: Katakana × Katakana
			case lft[ 0 ] === 'Katakana' && rgt[ 0 ] === 'Katakana':
				return false;
 
			// Do not break from extenders.
			// WB13a: (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
			case rgt[ 0 ] === 'ExtendNumLet' &&
				( lft[ 0 ] === 'ALetter' || lft[ 0 ] === 'HebrewLetter' || lft[ 0 ] === 'Numeric' || lft[ 0 ] === 'Katakana' || lft[ 0 ] === 'ExtendNumLet' ):
			// WB13b: ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
			case lft[ 0 ] === 'ExtendNumLet' &&
				( rgt[ 0 ] === 'ALetter' || rgt[ 0 ] === 'HebrewLetter' || rgt[ 0 ] === 'Numeric' || rgt[ 0 ] === 'Katakana' ):
				return false;
		}
 
		// Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.
		// WB15: ^ (RI RI)* RI × RI
		// WB16: [^RI] (RI RI)* RI × RI
		if ( lft[ 0 ] === 'RegionalIndicator' && rgt[ 0 ] === 'RegionalIndicator' ) {
			// Count RIs on the left
			var regional = 0;
			var n = 0;
 
			do {
				prevCodepoint = string.prevCodepoint( pos - n );
				if ( prevCodepoint === null ) {
					break;
				}
				n += prevCodepoint.length;
				prevProperty = getProperty( prevCodepoint );
				if ( prevProperty === 'RegionalIndicator' ) {
					regional++;
				}
			} while ( prevProperty === 'RegionalIndicator' || ( prevProperty && prevProperty.match( ZWJ_FE ) ) );
			if ( regional % 2 === 1 ) {
				return false;
			}
 
		}
		// Otherwise, break everywhere (including around ideographs).
		// WB999: Any ÷ Any
		return true;
	};
}() );