/**
* Finder of text content within lines of text
*
* @constructor
* @param {Object} [options] Search options
* @param {boolean} [options.wholeWord] Only match whole-word occurrences
*/
ve.dm.TextFinder = function VeDmTextFinder( options ) {
this.options = options;
};
OO.initClass( ve.dm.TextFinder );
/**
* @param {string} line The line of text to match
* @return {Array[]} Match ranges [ [ start1, end1 ], ... ]
*/
ve.dm.TextFinder.prototype.find = function ( line ) {
let ranges = this.findRanges( line );
if ( this.options.wholeWord ) {
const dataString = new ve.dm.DataString( [ ...line ] );
ranges = ranges.filter( ( range ) => unicodeJS.wordbreak.isBreak( dataString, range[ 0 ] ) &&
unicodeJS.wordbreak.isBreak( dataString, range[ 1 ] ) );
}
return ranges;
};
/**
* Find the matching ranges, without checking for word boundaries
*
* @param {string} line The line of text to match
* @return {Array[]} Match ranges [ [ start1, end1 ], ... ]
*/
ve.dm.TextFinder.prototype.findRanges = null;
/**
* @constructor
* @param {string} query Text to find
* @param {Object} [options] Search options
* @param {boolean} [options.lang='unk'] Language for Intl.Collator#compare
* @param {boolean} [options.caseSensitiveString] Case sensitive search for a string query
* @param {boolean} [options.diacriticInsensitiveString] Diacritic insensitive search for a string query
* Only works in browsers which support the Internationalization API
* @param {boolean} [options.noOverlaps] Avoid overlapping matches
* @return {ve.Range[]} List of ranges where the string was found
*/
ve.dm.StringTextFinder = function VeDmStringTextFinder( query, options ) {
ve.dm.StringTextFinder.super.call( this );
this.query = query;
this.options = options;
const lang = options.lang || 'unk';
let sensitivity;
if ( this.options.diacriticInsensitiveString ) {
sensitivity = this.options.caseSensitiveString ? 'case' : 'base';
} else {
sensitivity = this.options.caseSensitiveString ? 'variant' : 'accent';
}
// Intl is only used browser clients
this.compare = new Intl.Collator( lang, { sensitivity } ).compare;
};
OO.inheritClass( ve.dm.StringTextFinder, ve.dm.TextFinder );
/**
* @inheritdoc
*/
ve.dm.StringTextFinder.prototype.findRanges = function ( line ) {
// FIXME: For case-insensitive search, character-wise comparison is not i18n-safe. and
// it cannot be assumed that the match will have the same string length as the search
// query. E.g. the following compares equal:
// collator = new Intl.Collator( 'en', { sensitivity: 'base' } );
// collator.compare( '\u0130', '\u0069\u0307' );
const qLen = this.query.length;
const ranges = [];
// Iterate up to (and including) offset textLength - queryLength. Beyond that point
// there is not enough room for the query to exist
for ( let offset = 0, l = line.length - qLen; offset <= l; offset++ ) {
let j = 0;
while ( this.compare( line[ offset + j ], this.query[ j ] ) === 0 ) {
j++;
if ( j === qLen ) {
ranges.push( [ offset, offset + qLen ] );
offset += this.options.noOverlaps ? qLen - 1 : 0;
break;
}
}
}
return ranges;
};
/**
* @constructor
* @param {Set} query The set of words to match
* @param {Object} [options] Search options
* @param {boolean} [options.caseSensitiveString] Should the search be case sensitive.
* @param {boolean} [options.lang] Language for case folding. Required if !caseSensitiveString.
* @param {boolean} [options.noOverlaps] Avoid overlapping matches
*/
ve.dm.SetTextFinder = function VeDmSetTextFinder( query, options = {} ) {
ve.dm.SetTextFinder.super.call( this );
this.options = options;
this.lang = options.lang;
if ( options.caseSensitiveString ) {
this.normalizedQuery = query;
} else {
this.normalizedQuery = new Set( Array.from( query ).map( ( s ) => s.toLocaleLowerCase( this.lang ) ) );
}
this.minLen = Infinity;
this.maxLen = 0;
this.normalizedQuery.forEach( ( s ) => {
this.minLen = Math.min( this.minLen, s.length );
this.maxLen = Math.max( this.maxLen, s.length );
} );
};
OO.inheritClass( ve.dm.SetTextFinder, ve.dm.TextFinder );
/**
* Map from offset in case-folded string to offset in original string
* In some cases, case-folding can change string length
* For example, if s = '\u0130', then s.length === 1 but s.toLocaleLowerCase( 'en' ).length === 2
*
* @param {string} s
* @param {number} offsetLower in lowercased string
* @return {number} corresponding offset in original string
*/
ve.dm.SetTextFinder.static.fixOffset = function ( s, offsetLower ) {
// Start by guessing that lowercasing didn't change the offset,
// except when the offset is out of bounds in the original string
let guess = Math.min( offsetLower, s.length );
let diff = s.slice( 0, guess ).toLocaleLowerCase( this.lang ).length - offsetLower;
if ( diff === 0 ) {
// Optimization note: this will almost always be true
// Only rare characters change length of substr when case folding
return guess;
}
while ( diff > 0 ) {
// The lowercase substr is longer than original
guess--;
diff = s.slice( 0, guess ).toLocaleLowerCase( this.lang ).length - offsetLower;
}
while ( diff < 0 ) {
// The lowercase substr is shorter than original
guess++;
diff = s.slice( 0, guess ).toLocaleLowerCase( this.lang ).length - offsetLower;
}
// In some rare situations the diff might be positive now
// (which would correspond to no offset in the original string mapping to the desired offset)
return guess;
};
/**
* @inheritdoc
*/
ve.dm.SetTextFinder.prototype.findRanges = function ( line ) {
if ( this.normalizedQuery.size === 0 ) {
return [];
}
let normalizedLine = line;
if ( !this.options.caseSensitiveString ) {
normalizedLine = line.toLocaleLowerCase( this.lang );
}
const ranges = [];
// For each possible length, do a sliding window search on the normalized line
for ( let len = this.minLen; len <= this.maxLen; len++ ) {
for ( let i = 0; i <= normalizedLine.length - len; i++ ) {
const substr = normalizedLine.slice( i, i + len );
if ( this.normalizedQuery.has( substr ) ) {
let start = i;
let end = i + len;
if ( !this.options.caseSensitiveString ) {
start = this.constructor.static.fixOffset( line, start );
end = this.constructor.static.fixOffset( line, end );
}
ranges.push( [ start, end ] );
if ( this.options.noOverlaps ) {
i += len - 1;
}
}
}
}
return ranges;
};
/**
* @constructor
* @param {RegExp} query RegExp object with the /g flag
* @param {Object} [options] Search options
* @param {boolean} [options.noOverlaps] Avoid overlapping matches
*/
ve.dm.RegExpTextFinder = function VeDmRegExpTextFinder( query, options = {} ) {
ve.dm.RegExpTextFinder.super.call( this );
if ( !query.global ) {
throw new Error( 'The /g flag must be set on the query RegExp' );
}
this.query = query;
this.options = options;
};
OO.inheritClass( ve.dm.RegExpTextFinder, ve.dm.TextFinder );
/**
* @inheritdoc
*/
ve.dm.RegExpTextFinder.prototype.findRanges = function ( line ) {
this.query.lastIndex = 0;
let match;
const ranges = [];
while ( ( match = this.query.exec( line ) ) !== null ) {
const matchText = match[ 0 ];
// Skip empty string matches (e.g. with .*)
if ( matchText.length === 0 ) {
// Set lastIndex to the next character to avoid an infinite
// loop. Browsers differ in whether they do this for you
// for empty matches; see
// http://blog.stevenlevithan.com/archives/exec-bugs
this.query.lastIndex = match.index + 1;
continue;
}
ranges.push( [ match.index, match.index + matchText.length ] );
if ( !this.options.noOverlaps || matchText[ 0 ] === '\uFFFC' ) {
// We want overlaps, or the match starts with OBJECT REPLACEMENT CHARACTER,
// so jump back to the next character after the start of the match
this.query.lastIndex = match.index + 1;
}
}
return ranges;
};
/**
* Wrap a finder in a memoizing layer
*
* @constructor
* @param {ve.dm.TextFinder} finder The finder to wrap
* @param {number} [capacity=1000] Max number of results to remember
*/
ve.dm.MemoizedTextFinder = function VeDmMemoizedTextFinder( finder, capacity = 1000 ) {
this.finder = finder;
this.memory = new Map();
this.capacity = capacity;
};
OO.inheritClass( ve.dm.MemoizedTextFinder, ve.dm.TextFinder );
/**
* @inheritdoc
*/
ve.dm.MemoizedTextFinder.prototype.find = function ( line ) {
let result = this.memory.get( line );
if ( result === undefined ) {
result = this.finder.find( line );
} else {
// We'll refresh the key for LRU purposes
this.memory.delete( result );
}
this.memory.set( line, result );
if ( this.memory.size > this.capacity ) {
// Purge stalest key
this.memory.delete( this.memory.keys().next().value );
}
return result;
};