Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
90.48% |
38 / 42 |
|
50.00% |
1 / 2 |
CRAP | |
0.00% |
0 / 1 |
RegexFilter | |
90.48% |
38 / 42 |
|
50.00% |
1 / 2 |
10.09 | |
0.00% |
0 / 1 |
process | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
processRules | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
processRule | |
90.00% |
36 / 40 |
|
0.00% |
0 / 1 |
9.08 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Wikispeech\Segment\TextFilter; |
4 | |
5 | /** |
6 | * @file |
7 | * @ingroup Extensions |
8 | * @license GPL-2.0-or-later |
9 | */ |
10 | |
11 | use MWException; |
12 | |
13 | /** |
14 | * Transforms texts using regular expressions. |
15 | * |
16 | * Iterates available {@link FilterPart}s and executes {@link RegexFilterRule}s |
17 | * according to some logical rules as defined by the implementation. |
18 | * |
19 | * If a {@link RegexFilterRule} is matching a {@link FilterPart::$text}, |
20 | * it will potentially split prefix and suffix text in new {@link FilterPart}s |
21 | * that will be further processed. |
22 | * |
23 | * Using regular expressions like this is very limited and will often require |
24 | * them to be executed in a particular order. It is almost always better to |
25 | * create and parse an abstract syntax tree, but simple regular expressions are, |
26 | * when applicable, easier and faster to work with as a developer. |
27 | * |
28 | * Consider the following text: |
29 | * 'He turned 32 on dec 30, 1321' |
30 | * |
31 | * If you execute the rule for numbers before the date rule, you'll end up with: |
32 | * 'He turned thirty two on dec thirty, one thousand three hundred and twenty one'. |
33 | * |
34 | * Instead, if you execute the rule for date before the rule for numbers, you'll end up with: |
35 | * 'He turned 32 on december thirty, thirteen hundred and twenty one'. |
36 | * |
37 | * The more regular expression based rules you have, the more complex this will become. |
38 | * |
39 | * @since 0.1.10 |
40 | */ |
abstract class RegexFilter extends Filter {

	/**
	 * Runs the rules of the concrete filter via {@link processRules()},
	 * then delegates to the parent implementation to produce the output.
	 *
	 * @since 0.1.10
	 * @return string|null text/xml SSML output, or null if no rules applied
	 */
	public function process(): ?string {
		$this->processRules();
		return parent::process();
	}

	/**
	 * Executes the rules of this filter. Invoked by {@link process()}
	 * before the output is produced.
	 *
	 * Presumably implementations call {@link processRule()} once per rule,
	 * in whatever order the rules need to be applied.
	 *
	 * @since 0.1.10
	 */
	abstract public function processRules(): void;

	/**
	 * Applies a single rule to every part that has not yet been claimed by a rule.
	 *
	 * When the rule matches a part and produces an alias, the part is reduced to
	 * the text of the main group of the rule, and any text before or after that
	 * group is split off into new parts. The parts are then scanned again from
	 * the start, until a full pass is made without any change.
	 *
	 * NOTE(review): a rule whose main group can match the empty string inside a
	 * non-empty text may keep matching the split off remainder — confirm that
	 * rules never produce an alias for a zero-length match.
	 *
	 * @since 0.1.10
	 * @param RegexFilterRule $rule
	 * @throws MWException If expression is invalid.
	 */
	public function processRule(
		RegexFilterRule $rule
	): void {
		$hasChanges = true;
		while ( $hasChanges ) {
			$hasChanges = false;
			foreach ( $this->getParts() as $partIndex => $part ) {
				if ( $part->getAppliedRule() !== null ) {
					// Don't attempt to apply rules to a part which is the result of a previously invoked rule.
					continue;
				}

				$expression = $rule->getExpression();
				$text = $part->getText();
				$matches = [];
				$preg_matched = preg_match(
					$expression,
					$text,
					$matches,
					PREG_OFFSET_CAPTURE
				);

				if ( $preg_matched === false ) {
					throw new MWException(
						"Bad expression '{$expression}' on text '{$text}'."
					);
				} elseif ( $preg_matched !== 1 ) {
					// regular expression of rule does not match
					continue;
				}

				$alias = $rule->createAlias( $matches );
				if ( $alias === null ) {
					// The regular expression of the rule matched,
					// but due to logic of the rule it did not produce an alias.
					continue;
				}

				$mainGroup = $rule->getMainGroup();
				if ( !isset( $matches[$mainGroup] ) || $matches[$mainGroup][1] < 0 ) {
					// The expression matched, but the main group did not take part in
					// the match. Such a group is either left out of $matches or reported
					// with offset -1. Splitting on that would cut the text at the wrong
					// position and drop content, so leave the part untouched.
					continue;
				}
				$hasChanges = true;

				// Find out if there is any text before or after the matching group.
				// If so, then cut this out and add as new parts that might be processed
				// by other rules.

				// Matches contains start offset that are at byte level,
				// therefore we have to use strlen rather than mb_strlen below.

				/** @var string $mainText */
				$mainText = $matches[$mainGroup][0];
				/** @var int $startOffset */
				$startOffset = $matches[$mainGroup][1];
				/** @var int $endOffset */
				$endOffset = $startOffset + strlen( $mainText );
				if ( $startOffset > 0 ) {
					$prefixPart = new FilterPart(
						substr( $text, 0, $startOffset )
					);
					$this->insertPart( $partIndex, $prefixPart );
					// The current part moved one step forward when the prefix was inserted.
					$partIndex++;
				}
				if ( $endOffset < strlen( $text ) ) {
					$suffixPart = new FilterPart(
						substr( $text, $endOffset )
					);
					$this->insertPart( $partIndex + 1, $suffixPart );
				}

				// Update part to contain only transformed information
				$part->setText( $mainText );
				$part->setAppliedRule( $rule );
				$part->setAlias( $alias );

				// The list of parts has changed, restart the scan from the first part.
				break;
			}
		}
	}

}