Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
90.48% |
38 / 42 |
|
50.00% |
1 / 2 |
CRAP | |
0.00% |
0 / 1 |
| RegexFilter | |
90.48% |
38 / 42 |
|
50.00% |
1 / 2 |
10.09 | |
0.00% |
0 / 1 |
| process | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| processRules | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
| processRule | |
90.00% |
36 / 40 |
|
0.00% |
0 / 1 |
9.08 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MediaWiki\Wikispeech\Segment\TextFilter; |
| 4 | |
| 5 | /** |
| 6 | * @file |
| 7 | * @ingroup Extensions |
| 8 | * @license GPL-2.0-or-later |
| 9 | */ |
| 10 | |
| 11 | use LogicException; |
| 12 | |
| 13 | /** |
| 14 | * Transforms texts using regular expressions. |
| 15 | * |
| 16 | * Iterates available {@link FilterPart}s and executes {@link RegexFilterRule}s |
| 17 | * according to some logical rules as defined by the implementation. |
| 18 | * |
| 19 | * If a {@link RegexFilterRule} is matching a {@link FilterPart::$text}, |
| 20 | * it will potentially split prefix and suffix text in new {@link FilterPart}s |
| 21 | * that will be further processed. |
| 22 | * |
| 23 | * Using regular expressions like this is very limited and will often require |
| 24 | * them to be executed in a particular order. It is almost always better to |
| 25 | * create and parse an abstract syntax tree, but simple regular expressions are, |
| 26 | * when applicable, easier and faster to work with as a developer. |
| 27 | * |
| 28 | * Consider the following text: |
| 29 | * 'He turned 32 on dec 30, 1321' |
| 30 | * |
| 31 | * If you execute the rule for numbers before the date rule, you'll end up with: |
| 32 | * 'He turned thirty two on dec thirty, one thousand three hundred and twenty one'. |
| 33 | * |
| 34 | * Instead, if you execute the rule for date before the rule for numbers, you'll end up with: |
| 35 | * 'He turned 32 on december thirty, thirteen hundred and twenty one'. |
| 36 | * |
| 37 | * The more regular expression based rules you have, the more complex this will become. |
| 38 | * |
| 39 | * @since 0.1.10 |
| 40 | */ |
abstract class RegexFilter extends Filter {

	/**
	 * Runs {@link processRules()} and then delegates to the parent
	 * implementation to produce the output.
	 *
	 * @since 0.1.10
	 * @return string|null text/xml SSML output, or null if no rules applied
	 */
	public function process(): ?string {
		$this->processRules();
		return parent::process();
	}

	/**
	 * Invoked by {@link process()} before the parent produces its output.
	 *
	 * NOTE(review): implementations presumably call {@link processRule()}
	 * once per rule, in the order the rules must be evaluated -- confirm
	 * against the concrete subclasses.
	 *
	 * @since 0.1.10
	 */
	abstract public function processRules(): void;

	/**
	 * Applies a single rule to every part that has not yet been claimed by a rule.
	 *
	 * The parts are scanned from the start. On the first part where the rule's
	 * expression matches and the rule produces an alias, the part is narrowed
	 * down to the text of the rule's main group; any text before or after that
	 * group is inserted as new, unclaimed parts around it. The scan is then
	 * restarted, and this repeats until a full pass makes no change.
	 *
	 * A match whose main group did not participate (for instance an optional
	 * group or an untaken alternation branch) is skipped, since there is no
	 * span to cut the part around.
	 *
	 * @since 0.1.10
	 * @param RegexFilterRule $rule
	 * @throws LogicException If preg_match() reports a failure for the expression.
	 */
	public function processRule(
		RegexFilterRule $rule
	): void {
		$hasChanges = true;
		while ( $hasChanges ) {
			$hasChanges = false;
			foreach ( $this->getParts() as $partIndex => $part ) {
				if ( $part->getAppliedRule() !== null ) {
					// Don't attempt to apply rules to a part which is the result of a previously invoked rule.
					continue;
				}
				$text = $part->getText();
				$matches = [];
				$matchCount = preg_match(
					$rule->getExpression(),
					$text,
					$matches,
					PREG_OFFSET_CAPTURE
				);

				if ( $matchCount === false ) {
					throw new LogicException(
						"Bad expression '{$rule->getExpression()}' on text '{$part->getText()}'."
					);
				} elseif ( $matchCount !== 1 ) {
					// regular expression of rule does not match
					continue;
				}

				$alias = $rule->createAlias( $matches );
				if ( $alias === null ) {
					// The regular expression of the rule matched,
					// but due to logic of the rule it did not produce an alias.
					continue;
				}

				// With PREG_OFFSET_CAPTURE a group that did not participate in
				// the match is either reported with offset -1 or, if it is a
				// trailing group, omitted from $matches altogether. Neither
				// gives a usable span: a negative offset would make substr()
				// count from the end of the text and discard most of the part.
				$mainMatch = $matches[$rule->getMainGroup()] ?? null;
				if ( $mainMatch === null || $mainMatch[1] < 0 ) {
					continue;
				}
				$hasChanges = true;

				// Find out if there is any text before or after the matching group.
				// If so, then cut this out and add as new parts that might be processed
				// by other rules.

				// Matches contains start offset that are at byte level,
				// therefore we have to use strlen rather than mb_strlen below.

				/** @var string $matchedText */
				$matchedText = $mainMatch[0];
				/** @var int $startOffset */
				$startOffset = $mainMatch[1];
				/** @var int $endOffset */
				$endOffset = $startOffset + strlen( $matchedText );
				if ( $startOffset > 0 ) {
					$prefixPart = new FilterPart(
						substr( $text, 0, $startOffset )
					);
					$this->insertPart( $partIndex, $prefixPart );
					// The current part has moved one step to the right.
					$partIndex++;
				}
				if ( $endOffset < strlen( $text ) ) {
					$suffixPart = new FilterPart(
						substr( $text, $endOffset )
					);
					$this->insertPart( $partIndex + 1, $suffixPart );
				}

				// Update part to contain only transformed information
				$part->setText( $matchedText );
				$part->setAppliedRule( $rule );
				$part->setAlias( $alias );
				// The list of parts may have changed; restart the scan on a fresh copy.
				break;
			}
		}
	}

}