Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 282
0.00% covered (danger)
0.00%
0 / 6
CRAP
0.00% covered (danger)
0.00%
0 / 1
MhchemPatterns
0.00% covered (danger)
0.00%
0 / 282
0.00% covered (danger)
0.00%
0 / 6
1806
0.00% covered (danger)
0.00%
0 / 1
 getPatterns
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 findObserveGroups
0.00% covered (danger)
0.00%
0 / 28
0.00% covered (danger)
0.00%
0 / 1
132
 matchObsGrInner
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
30
 findObserveGroupsInner
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
56
 __construct
0.00% covered (danger)
0.00%
0 / 212
0.00% covered (danger)
0.00%
0 / 1
132
 match
0.00% covered (danger)
0.00%
0 / 20
0.00% covered (danger)
0.00%
0 / 1
56
1<?php
2/**
3 * Copyright (c) 2023 Johannes Stegmüller
4 *
5 * This file is a port of mhchemParser originally authored by Martin Hensel in javascript/typescript.
6 * The original license for this software can be found in the accompanying LICENSE.mhchemParser-ts.txt file.
7 */
8
9declare( strict_types = 1 );
10
11namespace MediaWiki\Extension\Math\WikiTexVC\Mhchem;
12
13use MediaWiki\Extension\Math\WikiTexVC\Mhchem\MhchemRegExp as Reg;
14use RuntimeException;
15
/**
 * Contains all matching regex patterns and match functions for mhchemParser in PHP.
 *
 * Corresponds mostly to the 'patterns' array in line ~207 in mhchemParser.js by Martin Hensel.
 *
 * A pattern is either a MhchemRegExp instance or a closure taking the input string and
 * returning null or [ "match_" => ..., "remainder" => ... ].
 *
 * @author Johannes Stegmüller
 * @license GPL-2.0-or-later
 */
class MhchemPatterns {

    /** @var array pattern name => MhchemRegExp instance or closure, filled by the constructor */
    private array $patterns;

    /**
     * Matching patterns
     * either regexes or function that return null or {match_:"a", remainder:"bc"}
     * @return array
     */
    public function getPatterns(): array {
        return $this->patterns;
    }

    /**
     * Extracts one delimited group, or two consecutive delimited groups, from the start of $input.
     *
     * Every delimiter is either a plain string or a MhchemRegExp instance. "Excl" delimiters are
     * consumed but not part of the returned match, "Incl" delimiters are part of it. While searching
     * for the closing delimiter, curly braces are counted and the closing delimiter is only
     * accepted at brace depth 0.
     *
     * A second group is looked for (directly after the first one) when $beg2Excl or $beg2Incl is
     * set according to MhchemUtil::issetJS.
     *
     * @param string $input text to scan
     * @param string|Reg $begExcl opening delimiter, not part of the match
     * @param string|Reg $begIncl opening delimiter, part of the match
     * @param string|Reg $endIncl closing delimiter, part of the match
     * @param string|Reg|null $endExcl closing delimiter, not part of the match; only used when
     *  MhchemUtil::issetJS( $endIncl ) is false
     * @param string|Reg|null $beg2Excl like $begExcl, for the second group
     * @param string|Reg|null $beg2Incl like $begIncl, for the second group
     * @param string|Reg|null $end2Incl like $endIncl, for the second group
     * @param string|Reg|null $end2Excl like $endExcl, for the second group
     * @param bool|null $combine if truthy, the two group matches are concatenated into one string,
     *  otherwise they are returned as a two-element array
     * @return array|null [ "match_" => string|array, "remainder" => string ] or null if nothing matched
     * @throws RuntimeException if $endIncl is the string "0", or on an unbalanced closing brace
     */
    public function findObserveGroups( $input, $begExcl, $begIncl, $endIncl,
                                $endExcl = null, $beg2Excl = null, $beg2Incl = null,
                                $end2Incl = null, $end2Excl = null, $combine = null ): ?array {
        $match = $this->matchObsGrInner( $input, $begExcl );
        if ( $match === null ) {
            return null;
        }
        // Drop the exclusive opening delimiter; everything below works on the shortened input.
        $input = substr( $input, strlen( $match ) );
        $match = $this->matchObsGrInner( $input, $begIncl );
        if ( $match === null ) {
            return null;
        }

        // "0" is falsy for the PHP ternary below, so it is rejected explicitly.
        if ( $endIncl === "0" ) {
            throw new RuntimeException( "error in condition, check next loc " );
        }
        $e = $this->findObserveGroupsInner( $input, strlen( $match ),
            MhchemUtil::issetJS( $endIncl ) ? $endIncl : $endExcl );
        if ( $e === null ) {
            return null;
        }
        // An inclusive end delimiter belongs to the match, an exclusive one does not.
        $match1 = substr( $input, 0, ( $endIncl ? $e["endMatchEnd"] : $e["endMatchBegin"] ) );

        if ( !( MhchemUtil::issetJS( $beg2Excl ) || MhchemUtil::issetJS( $beg2Incl ) ) ) {
            return [
                "match_" => $match1,
                "remainder" => substr( $input, $e["endMatchEnd"] )
            ];
        }

        $group2 = $this->findObserveGroups( substr( $input, $e["endMatchEnd"] ),
            $beg2Excl, $beg2Incl, $end2Incl, $end2Excl );
        if ( $group2 === null ) {
            return null;
        }
        $matchRet = [ $match1, $group2["match_"] ];
        return [
            "match_" => ( $combine ? implode( "", $matchRet ) : $matchRet ),
            "remainder" => $group2["remainder"]
        ];
    }

    /**
     * Checks whether $input starts with $pattern.
     *
     * @param string $input text to check
     * @param string|Reg|null $pattern plain string prefix or regex wrapper
     * @return string|null the matched text ($pattern itself for non-regex patterns), null if no match
     */
    private function matchObsGrInner( string $input, $pattern ) {
        /**
         * In javascript this is checking if the incoming pattern is a string,
         * if not the assumption is that it is of regex type. Since a regex is a
         * plain string in PHP as well, regex patterns are wrapped in MhchemRegExp
         * (Reg) here so that the two cases can be told apart.
         */
        if ( !$pattern instanceof Reg ) {
            // Added this if to catch empty needle for strpos input  in PHP
            if ( !MhchemUtil::issetJS( $pattern ) || str_starts_with( $input, $pattern ) ) {
                return $pattern;
            }
        } elseif ( preg_match( $pattern->getRegExp(), $input, $matches ) ) {
            return $matches[0];
        }
        return null;
    }

    /**
     * Scans $input from byte offset $i for the first occurrence of $endChars at brace depth 0.
     *
     * @param string $input text to scan
     * @param int $i byte offset to start scanning at
     * @param string|Reg|null $endChars closing delimiter to look for
     * @return array|null [ "endMatchBegin" => int, "endMatchEnd" => int ] (byte offsets into $input),
     *  or null if the end of $input is reached without finding the delimiter
     * @throws RuntimeException on a "}" at brace depth 0 that is not matched by $endChars
     */
    private function findObserveGroupsInner( string $input, $i, $endChars ): ?array {
        $braces = 0;
        // $input is never modified in the loop, so its length is computed once.
        $length = strlen( $input );
        while ( $i < $length ) {
            // A delimiter inside braces is never accepted, so only try to match at depth 0.
            if ( $braces === 0 ) {
                $match = $this->matchObsGrInner( substr( $input, $i ), $endChars );
                if ( $match !== null ) {
                    return [ "endMatchBegin" => $i, "endMatchEnd" => $i + strlen( $match ) ];
                }
            }
            $a = $input[$i];
            if ( $a === "{" ) {
                $braces++;
            } elseif ( $a === "}" ) {
                if ( $braces === 0 ) {
                    // Unexpected character
                    throw new RuntimeException(
                        "ExtraCloseMissingOpen: Extra close brace or missing open brace" );
                }
                $braces--;
            }
            $i++;
        }
        return null;
    }

    /**
     * Builds the table of named patterns.
     *
     * Regex patterns are wrapped in MhchemRegExp (Reg); everything that cannot be expressed as a
     * single regex is a closure that receives the input and returns null or
     * [ "match_" => ..., "remainder" => ... ].
     */
    public function __construct() {
        $this->patterns = [
            'empty' => new Reg( "/^$/" ),
            'else' => new Reg( "/^./" ),
            'else2' => new Reg( "/^./" ),
            'space' => new Reg( "/^\s/" ),
            'space A' => new Reg( "/^\s(?=[A-Z\\\\$])/" ),
            'space$' => new Reg( "/^\s$/" ),
            'a-z' => new Reg( "/^[a-z]/" ),
            'x' => new Reg( "/^x/" ),
            'x$' => new Reg( "/^x$/" ),
            'i$' => new Reg( "/^i$/" ),
            'letters' => new Reg(
                "/^(?:[a-zA-Z\x{03B1}-\x{03C9}\x{0391}-\x{03A9}?@]|(?:\\\\(?:alpha|beta|gamma|delta|epsilon"
                . "|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega|Gamma"
                . "|Delta|Theta|Lambda|Xi|Pi|Sigma|Upsilon|Phi|Psi|Omega)(?:\s+|\{\}|(?![a-zA-Z]))))+/u" ),
            '\\greek' => new Reg(
                "/^\\\\(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi"
                . "|rho|sigma|tau|upsilon|phi|chi|psi|omega|Gamma|Delta|Theta|Lambda|Xi|Pi|Sigma|Upsilon|Phi|Psi|Omega)"
                . "(?:\s+|\{\}|(?![a-zA-Z]))/" ),
            'one lowercase latin letter $' => new Reg( "/^(?:([a-z])(?:$|[^a-zA-Z]))$/" ),
            '$one lowercase latin letter$ $' => new Reg( "/^\\\$(?:([a-z])(?:$|[^a-zA-Z]))\\\$$/" ),
            'one lowercase greek letter $' => new Reg(
                "/^(?:\\\$?[\x{003B1}-\x{0003C9}]\\\$?|\\\$?\\\\(?:alpha|beta|gamma|" .
                "delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|" .
                "phi|chi|psi|omega)\s*\\\$?)(?:\s+|\{\}|(?![a-zA-Z]))$/u" ),
            'digits' => new Reg( "/^[0-9]+/" ),
            '-9.,9' => new Reg( "/^[+\-]?(?:[0-9]+(?:[,.][0-9]+)?|[0-9]*(?:\.[0-9]+))/" ),
            '-9.,9 no missing 0' => new Reg( "/^[+\-]?[0-9]+(?:[.,][0-9]+)?/" ),
            '(-)(9.,9)(e)(99)' => static function ( $input ) {
                $matches = [];
                $match = preg_match( "/^(\+\-|\+\/\-|\+|\-|\\\\pm\s?)?([0-9]+(?:[,.][0-9]+)?|" .
                    "[0-9]*(?:\.[0-9]+))?(\((?:[0-9]+(?:[,.][0-9]+)?|[0-9]*(?:\.[0-9]+))\))?(?:(?:([eE])" .
                    "|\s*(\*|x|\\\\times|\x{00D7})\s*10\^)([+\-]?[0-9]+|\{[+\-]?[0-9]+\}))?/u", $input, $matches );
                if ( $match && $matches[0] ) {
                    // could also match ""
                    return [ "match_" => array_slice( $matches, 1 ),
                        "remainder" => substr( $input, strlen( $matches[0] ) ) ];
                }
                return null;
            },
            '(-)(9)^(-9)' => new Reg( "/^(\+\-|\+\/\-|\+|\-|\\\\pm\s?)?([0-9]+(?:[,.][0-9]+)?|"
                . "[0-9]*(?:\.[0-9]+)?)\^([+\-]?[0-9]+|\{[+\-]?[0-9]+\})/" ),
            'state of aggregation $' => function ( $input ) {
                // ... or crystal system
                $a = $this->findObserveGroups( $input, "",
                    new Reg( "/^\([a-z]{1,3}(?=[\),])/" ), ")", "" );
                if ( $a && preg_match( "/^($|[\s,;\)\]\}])/", $a["remainder"] ) ) {
                    return $a;
                }
                $matches = [];
                $match = preg_match( "/^(?:\((?:\\\\ca\s?)?\\\$[amothc]\\\$\))/", $input, $matches );
                if ( $match ) {
                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
                }
                return null;
            },
            '_{(state of aggregation)}$' => new Reg( "/^_\{(\([a-z]{1,3}\))\}/" ),
            '{[(' => new Reg( "/^(?:\\\{|\[|\()/" ),
            ')]}' => new Reg( "/^(?:\)|\]|\\\})/" ),
            ', ' => new Reg( "/^[,;]\s*/" ),
            ',' => new Reg( "/^[,;]/" ),
            '.' => new Reg( "/^[.]/" ),
            '. __* ' => new Reg( "/^([.\x{22C5}\x{00B7}\x{2022}]|[*])\s*/u" ),
            '...' => new Reg( "/^\.\.\.(?=$|[^.])/" ),
            '^{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "^{", "", "", "}" );
            },
            '^($...$)' => function ( $input ) {
                return $this->findObserveGroups( $input, "^", "$", "$", "" );
            },
            '^a' => new Reg( "/^\^([0-9]+|[^\\\_])/u" ),
            '^\\x{}{}' => function ( $input ) {
                return $this->findObserveGroups( $input, "^",
                    new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "", "",
                    "{", "}", "", true );
            },
            '^\\x{}' => function ( $input ) {
                return $this->findObserveGroups( $input, "^",
                    new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "" );
            },
            '^\\x' => new Reg( "/^\^(\\\\[a-zA-Z]+)\s*/" ),
            '^(-1)' => new Reg( "/^\^(-?\d+)/" ),
            '\'' => new Reg( "/^'/" ),
            '_{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "_{", "", "", "}" );
            },
            '_($...$)' => function ( $input ) {
                return $this->findObserveGroups( $input, "_", "$", "$", "" );
            },
            '_9' => new Reg( "/^_([+\-]?[0-9]+|[^\\\\])/" ),
            '_\\x{}{}' => function ( $input ) {
                return $this->findObserveGroups( $input, "_", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
                    "", "", "{", "}", "", true );
            },
            '_\\x{}' => function ( $input ) {
                return $this->findObserveGroups( $input, "_",
                    new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "" );
            },
            '_\\x' => new Reg( "/^_(\\\\[a-zA-Z]+)\s*/" ),
            '^_' => new Reg( "/^(?:\^(?=_)|\_(?=\^)|[\^_]$)/" ),
            '{}^' => new Reg( "/^\{\}(?=\^)/" ),
            '{}' => new Reg( "/^\{\}/" ),
            '{...}' => function ( $input ) {
                return $this->findObserveGroups( $input, "", "{", "}", "" );
            },
            '{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "{", "", "", "}" );
            },
            '$...$' => function ( $input ) {
                return $this->findObserveGroups( $input, "", "\$", "\$", "" );
            },
            '${(...)}$__$(...)$' => function ( $input ) {
                return $this->findObserveGroups( $input, "\${", "", "", "}\$" )
                    ?? $this->findObserveGroups( $input, "\$", "", "", "\$" );
            },
            '=<>' => new Reg( "/^[=<>]/" ),
            '#' => new Reg( "/^[#\x{2261}]/u" ),
            '+' => new Reg( "/^\+/" ),
            // -space -, -; -] -/ -$ -state-of-aggregation orig:  "/^-(?=[\s_},;\]/]|$|\([a-z]+\))/"
            '-$' => new Reg( "/^-(?=[\s_},;\]\/]|$|\([a-z]+\))/u" ),
            '-9' => new Reg( "/^-(?=[0-9])/" ),
            '- orbital overlap' => new Reg( "/^-(?=(?:[spd]|sp)(?:$|[\s,;\)\]\}]))/" ),
            '-' => new Reg( "/^-/" ),
            'pm-operator' => new Reg( "/^(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)/" ),
            'operator' => new Reg( "/^(?:\+|(?:[\-=<>]|<<|>>|\\\\approx|\\\$\\\\approx\\\$)(?=\s|$|-?[0-9]))/" ),
            'arrowUpDown' => new Reg( "/^(?:v|\(v\)|\^|\(\^\))(?=$|[\s,;\)\]\}])/" ),
            '\\bond{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "\\bond{", "", "", "}" );
            },
            '->' => new Reg( '/^(?:<->|<-->|->|<-|<=>>|<<=>|<=>|[\x{2192}\x{27F6}\x{21CC}])/u' ),
            'CMT' => new Reg( "/^[CMT](?=\[)/" ),
            '[(...)]' => function ( $input ) {
                return $this->findObserveGroups( $input, "[", "",
                    "", "]" );
            },
            '1st-level escape' => new Reg( "/^(&|\\\\\\\\|\\\\hline)\s*/" ),
            // \\x - but output no space before
            '\\,' => new Reg( "/^(?:\\\\[,\ ;:])/" ),
            '\\x{}{}' => function ( $input ) {
                return $this->findObserveGroups( $input, "", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
                    "", "", "{", "}", "", true );
            },
            '\\x{}' => function ( $input ) {
                return $this->findObserveGroups( $input, "", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
                    "" );
            },
            '\\ca' => new Reg( "/^\\\\ca(?:\s+|(?![a-zA-Z]))/" ),
            '\\x' => new Reg( "/^(?:\\\\[a-zA-Z]+\s*|\\\\[_&{}%])/" ),
            // only those with numbers in front, because the others will be formatted correctly anyway
            'orbital' => new Reg( "/^(?:[0-9]{1,2}[spdfgh]|[0-9]{0,2}sp)(?=$|[^a-zA-Z])/" ),
            'others' => new Reg( "/^[\/~|]/" ),
            '\\frac{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "\\frac{", "",
                    "", "}", "{", "", "", "}" );
            },
            '\\overset{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "\\overset{", "",
                    "", "}", "{", "", "", "}" );
            },
            '\\underset{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "\\underset{", "",
                    "", "}", "{", "", "", "}" );
            },
            '\\underbrace{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "\\underbrace{", "",
                    "", "}_", "{", "", "", "}" );
            },
            '\\color{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "\\color{", "", "", "}" );
            },
            '\\color{(...)}{(...)}' => function ( $input ) {
                // ?? instead of ||
                return $this->findObserveGroups( $input, "\\color{", "",
                    "", "}", "{", "", "", "}" ) ??
                $this->findObserveGroups( $input, "\\color", "\\", "",
                    new Reg( "/^(?=\{)/" ), "{", "", "", "}" );
            },
            '\\ce{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input, "\\ce{", "", "", "}" );
            },
            '\\pu{(...)}' => function ( $input ) {
                return $this->findObserveGroups( $input,
                    "\\pu{", "", "", "}" );
            },
            'oxidation$' => new Reg( "/^(?:[+-][IVX]+|(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)\s*0)$/" ),
            'd-oxidation$' => new Reg( "/^(?:[+-]?[IVX]+|(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)\s*0)$/" ),
            '1/2$' => new Reg( "/^[+\-]?(?:[0-9]+|\\\$[a-z]\\\$|[a-z])\/[0-9]+(?:\\\$[a-z]\\\$|[a-z])?$/" ),
            'amount' => function ( $input ) {
                $matches = [];
                // e.g. 2, 0.5, 1/2, -2, n/2, +;  $a$ could be added later in parsing
                $match = preg_match( "/^(?:(?:(?:\([+\-]?[0-9]+\/[0-9]+\)|[+\-]?(?:[0-9]+|\\\$[a-z]\\\$" .
                    "|[a-z])\/[0-9]+|[+\-]?[0-9]+[.,][0-9]+|[+\-]?\.[0-9]+|[+\-]?[0-9]+)(?:[a-z](?=\s*[A-Z]))?)" .
                    "|[+\-]?[a-z](?=\s*[A-Z])|\+(?!\s))/", $input, $matches );

                if ( $match ) {
                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
                }
                $a = $this->findObserveGroups( $input, "", "$", "$", "" );
                // e.g. $2n-1$, $-$
                if ( MhchemUtil::issetJS( $a ) ) {
                    $matchesI = [];

                    $match = preg_match( "/^\\\$(?:\(?[+\-]?(?:[0-9]*[a-z]?[+\-])" .
                        "?[0-9]*[a-z](?:[+\-][0-9]*[a-z]?)?\)?|\+|-)\\\$$/", $a["match_"] ?? "",
                        $matchesI );
                    if ( $match ) {
                        return [ "match_" => $matchesI[0], "remainder" => substr( $input, strlen( $matchesI[0] ) ) ];
                    }
                }
                return null;
            },
            'amount2' => function ( $input ) {
                /* @phan-suppress-next-line PhanInfiniteRecursion, PhanUndeclaredInvokeInCallable */
                return $this->patterns['amount']( $input );
            },
            '(KV letters),' => new Reg( "/^(?:[A-Z][a-z]{0,2}|i)(?=,)/" ),
            'formula$' => static function ( $input ) {
                if ( preg_match( "/^\([a-z]+\)$/", $input ) ) {
                    // state of aggregation = no formula
                    return null;
                }
                $matches = [];
                $match = preg_match( "/^(?:[a-z]|(?:[0-9\ \+\-\,\.\(\)]+[a-z])+[0-9\ \+\-\,\.\(\)]*|"
                    . "(?:[a-z][0-9\ \+\-\,\.\(\)]+)+[a-z]?)$/", $input, $matches );
                if ( $match ) {
                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
                }
                return null;
            },
            'uprightEntities' => new Reg( "/^(?:pH|pOH|pC|pK|iPr|iBu)(?=$|[^a-zA-Z])/" ),
            '/' => new Reg( "/^\s*(\/)\s*/" ),
            '//' => new Reg( "/^\s*(\/\/)\s*/" ),
            '*' => new Reg( "/^\s*[*.]\s*/" )
        ];
    }

    /**
     * Matching function
     * e.g. match("a", input) will look for the regexp called "a" and see if it matches
     * returns null or {match_:"a", remainder:"bc"}
     *
     * For regex patterns, "match_" is the list of capture groups when preg_match reports more
     * than one group; otherwise it is capture group 1 if that is set according to
     * MhchemUtil::issetJS, else the whole match. Closure patterns define their own result.
     *
     * @param string $m key for fetching a pattern
     * @param string $input string to check
     * @return array|mixed|null information about the match
     * @throws RuntimeException if no pattern is registered under $m
     */
    public function match( string $m, string $input ) {
        $pattern = $this->patterns[$m] ?? null;
        if ( !$pattern ) {
            // Trying to use non-existing pattern
            throw new RuntimeException( "MhchemBugP: mhchem bug P. Please report. (" . $m . ")" );
        }

        if ( $pattern instanceof Reg ) {
            $matches = [];
            if ( !preg_match( $pattern->getRegExp(), $input, $matches ) ) {
                return null;
            }
            $remainder = substr( $input, strlen( $matches[0] ) );
            if ( count( $matches ) > 2 ) {
                return [
                    "match_" => array_slice( $matches, 1 ),
                    "remainder" => $remainder
                ];
            }
            return [
                "match_" => MhchemUtil::issetJS( $matches[1] ?? null ) ? $matches[1] : $matches[0],
                "remainder" => $remainder
            ];
        }

        if ( is_callable( $pattern ) ) {
            // $pattern cannot be an instance of MhchemRegExp here, which causes this warning.
            /* @phan-suppress-next-line PhanUndeclaredInvokeInCallable */
            return $this->patterns[$m]( $input );
        }

        return null;
    }
}