Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 282
0.00% covered (danger)
0.00%
0 / 6
CRAP
0.00% covered (danger)
0.00%
0 / 1
MhchemPatterns
0.00% covered (danger)
0.00%
0 / 282
0.00% covered (danger)
0.00%
0 / 6
1806
0.00% covered (danger)
0.00%
0 / 1
 getPatterns
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 findObserveGroups
0.00% covered (danger)
0.00%
0 / 28
0.00% covered (danger)
0.00%
0 / 1
132
 matchObsGrInner
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
30
 findObserveGroupsInner
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
56
 __construct
0.00% covered (danger)
0.00%
0 / 212
0.00% covered (danger)
0.00%
0 / 1
132
 match
0.00% covered (danger)
0.00%
0 / 20
0.00% covered (danger)
0.00%
0 / 1
56
1<?php
2/**
3 * Copyright (c) 2023 Johannes Stegmüller
4 *
5 * This file is a port of mhchemParser originally authored by Martin Hensel in javascript/typescript.
6 * The original license for this software can be found in the accompanying LICENSE.mhchemParser-ts.txt file.
7 */
8
9declare( strict_types = 1 );
10
11namespace MediaWiki\Extension\Math\WikiTexVC\Mhchem;
12
13use MediaWiki\Extension\Math\WikiTexVC\Mhchem\MhchemRegExp as Reg;
14use RuntimeException;
15
16/**
17 * Contains all matching regex patterns and match functions for mhchemParser in PHP.
18 *
19 * corresponds mostly to the 'patterns' array in line ~207 in mhchemParser.js by Martin Hensel
20 *
21 * @author Johannes Stegmüller
22 * @license GPL-2.0-or-later
23 */
24class MhchemPatterns {
25
26    /** @var array */
27    private array $patterns;
28
    /**
     * Returns the complete pattern table that was built in the constructor.
     *
     * Every entry is keyed by its pattern name and holds either a regex wrapper
     * (MhchemRegExp) or a closure. A closure takes the input string and returns null
     * or an array with the keys "match_" and "remainder".
     *
     * @return array
     */
    public function getPatterns(): array {
        return $this->patterns;
    }
37
38    /**
39     * @param string $input
40     * @param string|Reg $begExcl
41     * @param string|Reg $begIncl
42     * @param mixed|null $endIncl
43     * @param mixed|null $endExcl
44     * @param string|Reg|null $beg2Excl
45     * @param string|Reg|null $beg2Incl
46     * @param mixed|null $end2Incl
47     * @param mixed|null $end2Excl
48     * @param bool|null $combine
49     */
50    public function findObserveGroups( $input, $begExcl, $begIncl, $endIncl,
51                                $endExcl = null, $beg2Excl = null, $beg2Incl = null,
52                                $end2Incl = null, $end2Excl = null, $combine = null ): ?array {
53        $match = $this->matchObsGrInner( $input, $begExcl );
54        if ( $match === null ) {
55            return null;
56        }
57        $input = substr( $input, strlen( $match ) );
58        $match = $this->matchObsGrInner( $input, $begIncl );
59        if ( $match === null ) {
60            return null;
61        }
62
63        if ( $endIncl === "0" ) {
64            throw new RuntimeException( "error in condition, check next loc " );
65        }
66        $e = $this->findObserveGroupsInner( $input, strlen( $match ),
67            MhchemUtil::issetJS( $endIncl ) ? $endIncl : $endExcl );
68        if ( $e === null ) {
69            return null;
70        }
71        $match1 = substr( $input, 0, ( $endIncl ? $e["endMatchEnd"] : $e["endMatchBegin"] ) );
72
73        if ( !( MhchemUtil::issetJS( $beg2Excl ) || MhchemUtil::issetJS( $beg2Incl ) ) ) {
74            return [
75                "match_" => $match1,
76                "remainder" => substr( $input, $e["endMatchEnd"] )
77            ];
78        } else {
79            $group2 = $this->findObserveGroups( substr( $input, $e["endMatchEnd"] ),
80                $beg2Excl, $beg2Incl, $end2Incl, $end2Excl );
81            if ( $group2 === null ) {
82                return null;
83            }
84            $matchRet = [ $match1, $group2["match_"] ];
85            return [
86                "match_" => ( $combine ? implode( "", $matchRet ) : $matchRet ),
87                "remainder" => $group2["remainder"]
88            ];
89        }
90    }
91
92    /**
93     * @param string $input
94     * @param string|Reg $pattern
95     * @return string|null
96     */
97    private function matchObsGrInner( string $input, $pattern ) {
98        /**
99         * In javascript this is checking if the incoming pattern is a string,
100         * if not the assumption is that it is of regex type. Since PHP has
101         * strings here.
102         */
103        if ( !$pattern instanceof Reg ) {
104            // Added this if to catch empty needle for strpos input  in PHP
105            if ( !MhchemUtil::issetJS( $pattern ) || str_starts_with( $input, $pattern ) ) {
106                return $pattern;
107            }
108        } elseif ( preg_match( $pattern->getRegExp(), $input, $matches ) ) {
109            return $matches[0];
110        }
111        return null;
112    }
113
114    /**
115     * @param string $input
116     * @param int $i
117     * @param string $endChars
118     * @return array|null
119     */
120    private function findObserveGroupsInner( string $input, $i, $endChars ): ?array {
121        $braces = 0;
122        while ( $i < strlen( $input ) ) {
123            $a = $input[$i];
124            $match = $this->matchObsGrInner( substr( $input, $i ), $endChars );
125            if ( $match !== null && $braces === 0 ) {
126                return [ "endMatchBegin" => $i, "endMatchEnd" => $i + strlen( $match ) ];
127            } elseif ( $a === "{" ) {
128                $braces++;
129            } elseif ( $a === "}" ) {
130                if ( $braces === 0 ) {
131                    // Unexpected character
132                    throw new RuntimeException(
133                        "ExtraCloseMissingOpen: Extra close brace or missing open brace" );
134                } else {
135                    $braces--;
136                }
137            }
138            $i++;
139        }
140        return null;
141    }
142
143    public function __construct() {
144        $this->patterns = [
145            'empty' => new Reg( "/^$/" ),
146            'else' => new Reg( "/^./" ),
147            'else2' => new Reg( "/^./" ),
148            'space' => new Reg( "/^\s/" ),
149            'space A' => new Reg( "/^\s(?=[A-Z\\\\$])/" ),
150            'space$' => new Reg( "/^\s$/" ),
151            'a-z' => new Reg( "/^[a-z]/" ),
152            'x' => new Reg( "/^x/" ),
153            'x$' => new Reg( "/^x$/" ),
154            'i$' => new Reg( "/^i$/" ),
155            'letters' => new Reg(
156                "/^(?:[a-zA-Z\x{03B1}-\x{03C9}\x{0391}-\x{03A9}?@]|(?:\\\\(?:alpha|beta|gamma|delta|epsilon"
157                . "|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega|Gamma"
158                . "|Delta|Theta|Lambda|Xi|Pi|Sigma|Upsilon|Phi|Psi|Omega)(?:\s+|\{\}|(?![a-zA-Z]))))+/u" ),
159            '\\greek' => new Reg(
160                "/^\\\\(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi"
161                . "|rho|sigma|tau|upsilon|phi|chi|psi|omega|Gamma|Delta|Theta|Lambda|Xi|Pi|Sigma|Upsilon|Phi|Psi|Omega)"
162                . "(?:\s+|\{\}|(?![a-zA-Z]))/" ),
163            'one lowercase latin letter $' => new Reg( "/^(?:([a-z])(?:$|[^a-zA-Z]))$/" ),
164            '$one lowercase latin letter$ $' => new Reg( "/^\\\$(?:([a-z])(?:$|[^a-zA-Z]))\\\$$/" ),
165            'one lowercase greek letter $' => new Reg(
166                "/^(?:\\\$?[\x{003B1}-\x{0003C9}]\\\$?|\\\$?\\\\(?:alpha|beta|gamma|" .
167                "delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|" .
168                "phi|chi|psi|omega)\s*\\\$?)(?:\s+|\{\}|(?![a-zA-Z]))$/u" ),
169            'digits' => new Reg( "/^[0-9]+/" ),
170            '-9.,9' => new Reg( "/^[+\-]?(?:[0-9]+(?:[,.][0-9]+)?|[0-9]*(?:\.[0-9]+))/" ),
171            '-9.,9 no missing 0' => new Reg( "/^[+\-]?[0-9]+(?:[.,][0-9]+)?/" ),
172            '(-)(9.,9)(e)(99)' => static function ( $input ) {
173                $matches = [];
174                $match = preg_match( "/^(\+\-|\+\/\-|\+|\-|\\\\pm\s?)?([0-9]+(?:[,.][0-9]+)?|" .
175                    "[0-9]*(?:\.[0-9]+))?(\((?:[0-9]+(?:[,.][0-9]+)?|[0-9]*(?:\.[0-9]+))\))?(?:(?:([eE])" .
176                    "|\s*(\*|x|\\\\times|\x{00D7})\s*10\^)([+\-]?[0-9]+|\{[+\-]?[0-9]+\}))?/u", $input, $matches );
177                if ( $match && $matches[0] ) {
178                    // could also match ""
179                    return [ "match_" => array_slice( $matches, 1 ),
180                        "remainder" => substr( $input, strlen( $matches[0] ) ) ];
181                }
182                return null;
183            },
184            '(-)(9)^(-9)' => new Reg( "/^(\+\-|\+\/\-|\+|\-|\\\\pm\s?)?([0-9]+(?:[,.][0-9]+)?|"
185                . "[0-9]*(?:\.[0-9]+)?)\^([+\-]?[0-9]+|\{[+\-]?[0-9]+\})/" ),
186            'state of aggregation $' => function ( $input ) {
187                // ... or crystal system
188                $a = $this->findObserveGroups( $input, "",
189                    new Reg( "/^\([a-z]{1,3}(?=[\),])/" ), ")", "" );
190                if ( $a && preg_match( "/^($|[\s,;\)\]\}])/", $a["remainder"] ) ) {
191                    return $a;
192                }
193                $matches = [];
194                $match = preg_match( "/^(?:\((?:\\\\ca\s?)?\\\$[amothc]\\\$\))/", $input, $matches );
195                if ( $match ) {
196                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
197                }
198                return null;
199            },
200            '_{(state of aggregation)}$' => new Reg( "/^_\{(\([a-z]{1,3}\))\}/" ),
201            '{[(' => new Reg( "/^(?:\\\{|\[|\()/" ),
202            ')]}' => new Reg( "/^(?:\)|\]|\\\})/" ),
203            ', ' => new Reg( "/^[,;]\s*/" ),
204            ',' => new Reg( "/^[,;]/" ),
205            '.' => new Reg( "/^[.]/" ),
206            '. __* ' => new Reg( "/^([.\x{22C5}\x{00B7}\x{2022}]|[*])\s*/u" ),
207            '...' => new Reg( "/^\.\.\.(?=$|[^.])/" ),
208            '^{(...)}' => function ( $input ) {
209                    return $this->findObserveGroups( $input, "^{", "", "", "}" );
210            },
211            '^($...$)' => function ( $input ) {
212                    return $this->findObserveGroups( $input, "^", "$", "$", "" );
213            },
214            '^a' => new Reg( "/^\^([0-9]+|[^\\\_])/u" ),
215            '^\\x{}{}' => function ( $input ) {
216                    return $this->findObserveGroups( $input, "^",
217                        new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "", "",
218                        "{", "}", "", true );
219            },
220            '^\\x{}' => function ( $input ) {
221                    return $this->findObserveGroups( $input, "^",
222                        new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "" );
223            },
224            '^\\x' => new Reg( "/^\^(\\\\[a-zA-Z]+)\s*/" ),
225            '^(-1)' => new Reg( "/^\^(-?\d+)/" ),
226            '\'' => new Reg( "/^'/" ),
227            '_{(...)}' => function ( $input ) {
228                return $this->findObserveGroups( $input, "_{", "", "", "}" );
229            },
230            '_($...$)' => function ( $input ) {
231                return $this->findObserveGroups( $input, "_", "$", "$", "" );
232            },
233            '_9' => new Reg( "/^_([+\-]?[0-9]+|[^\\\\])/" ),
234            '_\\x{}{}' => function ( $input ) {
235                return $this->findObserveGroups( $input, "_", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
236                    "", "", "{", "}", "", true );
237            },
238            '_\\x{}' => function ( $input ) {
239                return $this->findObserveGroups( $input, "_",
240                    new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "" );
241            },
242            '_\\x' => new Reg( "/^_(\\\\[a-zA-Z]+)\s*/" ),
243            '^_' => new Reg( "/^(?:\^(?=_)|\_(?=\^)|[\^_]$)/" ),
244            '{}^' => new Reg( "/^\{\}(?=\^)/" ),
245            '{}' => new Reg( "/^\{\}/" ),
246            '{...}' => function ( $input ) {
247                return $this->findObserveGroups( $input, "", "{", "}", "" );
248            },
249            '{(...)}' => function ( $input ) {
250                return $this->findObserveGroups( $input, "{", "", "", "}" );
251            },
252            '$...$' => function ( $input ) {
253                return $this->findObserveGroups( $input, "", "\$", "\$", "" );
254            },
255            '${(...)}$__$(...)$' => function ( $input ) {
256                return $this->findObserveGroups( $input, "\${", "", "", "}\$" )
257                    ?? $this->findObserveGroups( $input, "\$", "", "", "\$" );
258            },
259            '=<>' => new Reg( "/^[=<>]/" ),
260            '#' => new Reg( "/^[#\x{2261}]/u" ),
261            '+' => new Reg( "/^\+/" ),
262            // -space -, -; -] -/ -$ -state-of-aggregation orig:  "/^-(?=[\s_},;\]/]|$|\([a-z]+\))/"
263            '-$' => new Reg( "/^-(?=[\s_},;\]\/]|$|\([a-z]+\))/u" ),
264            '-9' => new Reg( "/^-(?=[0-9])/" ),
265            '- orbital overlap' => new Reg( "/^-(?=(?:[spd]|sp)(?:$|[\s,;\)\]\}]))/" ),
266            '-' => new Reg( "/^-/" ),
267            'pm-operator' => new Reg( "/^(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)/" ),
268            'operator' => new Reg( "/^(?:\+|(?:[\-=<>]|<<|>>|\\\\approx|\\\$\\\\approx\\\$)(?=\s|$|-?[0-9]))/" ),
269            'arrowUpDown' => new Reg( "/^(?:v|\(v\)|\^|\(\^\))(?=$|[\s,;\)\]\}])/" ),
270            '\\bond{(...)}' => function ( $input ) {
271                return $this->findObserveGroups( $input, "\\bond{", "", "", "}" );
272            },
273            '->' => new Reg( '/^(?:<->|<-->|->|<-|<=>>|<<=>|<=>|[\x{2192}\x{27F6}\x{21CC}])/u' ),
274            'CMT' => new Reg( "/^[CMT](?=\[)/" ),
275            '[(...)]' => function ( $input ) { return $this->findObserveGroups( $input, "[", "",
276                "", "]" );
277            },
278            '1st-level escape' => new Reg( "/^(&|\\\\\\\\|\\\\hline)\s*/" ),
279            // \\x - but output no space before
280            '\\,' => new Reg( "/^(?:\\\\[,\ ;:])/" ),
281            '\\x{}{}' => function ( $input ) {
282                return $this->findObserveGroups( $input, "", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
283                    "", "", "{", "}", "", true );
284            },
285            '\\x{}' => function ( $input ) {
286                return $this->findObserveGroups( $input, "", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
287                    "" );
288            },
289            '\\ca' => new Reg( "/^\\\\ca(?:\s+|(?![a-zA-Z]))/" ),
290            '\\x' => new Reg( "/^(?:\\\\[a-zA-Z]+\s*|\\\\[_&{}%])/" ),
291            // only those with numbers in front, because the others will be formatted correctly anyway
292            'orbital' => new Reg( "/^(?:[0-9]{1,2}[spdfgh]|[0-9]{0,2}sp)(?=$|[^a-zA-Z])/" ),
293            'others' => new Reg( "/^[\/~|]/" ),
294            '\\frac{(...)}' => function ( $input ) {
295                    return $this->findObserveGroups( $input, "\\frac{", "",
296                        "", "}", "{", "", "", "}" );
297            },
298            '\\overset{(...)}' => function ( $input ) {
299                    return $this->findObserveGroups( $input, "\\overset{", "",
300                        "", "}", "{", "", "", "}" );
301            },
302            '\\underset{(...)}' => function ( $input ) {
303                    return $this->findObserveGroups( $input, "\\underset{", "",
304                        "", "}", "{", "", "", "}" );
305            },
306            '\\underbrace{(...)}' => function ( $input ) {
307                    return $this->findObserveGroups( $input, "\\underbrace{", "",
308                        "", "}_", "{", "", "", "}" );
309            },
310            '\\color{(...)}' => function ( $input ) {
311                    return $this->findObserveGroups( $input, "\\color{", "", "", "}" );
312            },
313            '\\color{(...)}{(...)}' => function ( $input ) {
314                // ?? instead of ||
315                return $this->findObserveGroups( $input, "\\color{", "",
316                    "", "}", "{", "", "", "}" ) ??
317                $this->findObserveGroups( $input, "\\color", "\\", "",
318                    new Reg( "/^(?=\{)/" ), "{", "", "", "}" );
319            },
320            '\\ce{(...)}' => function ( $input ) {
321                return $this->findObserveGroups( $input, "\\ce{", "", "", "}" );
322            },
323            '\\pu{(...)}' => function ( $input ) { return $this->findObserveGroups( $input,
324                "\\pu{", "", "", "}" );
325            },
326            'oxidation$' => new Reg( "/^(?:[+-][IVX]+|(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)\s*0)$/" ),
327            'd-oxidation$' => new Reg( "/^(?:[+-]?[IVX]+|(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)\s*0)$/" ),
328            '1/2$' => new Reg( "/^[+\-]?(?:[0-9]+|\\\$[a-z]\\\$|[a-z])\/[0-9]+(?:\\\$[a-z]\\\$|[a-z])?$/" ),
329            'amount' => function ( $input ) {
330                $matches = [];
331                // e.g. 2, 0.5, 1/2, -2, n/2, +;  $a$ could be added later in parsing
332                $match = preg_match( "/^(?:(?:(?:\([+\-]?[0-9]+\/[0-9]+\)|[+\-]?(?:[0-9]+|\\\$[a-z]\\\$" .
333                    "|[a-z])\/[0-9]+|[+\-]?[0-9]+[.,][0-9]+|[+\-]?\.[0-9]+|[+\-]?[0-9]+)(?:[a-z](?=\s*[A-Z]))?)" .
334                    "|[+\-]?[a-z](?=\s*[A-Z])|\+(?!\s))/", $input, $matches );
335
336                if ( $match ) {
337                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
338                }
339                $a = $this->findObserveGroups( $input, "", "$", "$", "" );
340                // e.g. $2n-1$, $-$
341                if ( MhchemUtil::issetJS( $a ) ) {
342                    $matchesI = [];
343
344                    $match = preg_match( "/^\\\$(?:\(?[+\-]?(?:[0-9]*[a-z]?[+\-])" .
345                        "?[0-9]*[a-z](?:[+\-][0-9]*[a-z]?)?\)?|\+|-)\\\$$/", $a["match_"] ?? "",
346                        $matchesI );
347                    if ( $match ) {
348                        return [ "match_" => $matchesI[0], "remainder" => substr( $input, strlen( $matchesI[0] ) ) ];
349                    }
350                }
351                return null;
352            },
353            'amount2' => function ( $input ) {
354                /* @phan-suppress-next-line PhanInfiniteRecursion, PhanUndeclaredInvokeInCallable */
355                return $this->patterns['amount']( $input );
356            },
357            '(KV letters),' => new Reg( "/^(?:[A-Z][a-z]{0,2}|i)(?=,)/" ),
358            'formula$' => static function ( $input ) {
359                if ( preg_match( "/^\([a-z]+\)$/", $input ) ) {
360                    // state of aggregation = no formula
361                    return null;
362                }
363                $matches = [];
364                $match = preg_match( "/^(?:[a-z]|(?:[0-9\ \+\-\,\.\(\)]+[a-z])+[0-9\ \+\-\,\.\(\)]*|"
365                    . "(?:[a-z][0-9\ \+\-\,\.\(\)]+)+[a-z]?)$/", $input, $matches );
366                if ( $match ) {
367                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
368                }
369                return null;
370            },
371            'uprightEntities' => new Reg( "/^(?:pH|pOH|pC|pK|iPr|iBu)(?=$|[^a-zA-Z])/" ),
372            '/' => new Reg( "/^\s*(\/)\s*/" ),
373            '//' => new Reg( "/^\s*(\/\/)\s*/" ),
374            '*' => new Reg( "/^\s*[*.]\s*/" )
375        ];
376    }
377
378    /**
379     * Matching function
380     * e.g. match("a", input) will look for the regexp called "a" and see if it matches
381     * returns null or {match_:"a", remainder:"bc"}
382     * @param string $m key for fetching a pattern
383     * @param string $input string to check
384     * @return array|mixed|null information about the match
385     */
386    public function match( string $m, string $input ) {
387        $pattern = $this->patterns[$m] ?? null;
388        if ( !$pattern ) {
389            // Trying to use non-existing pattern
390            throw new RuntimeException( "MhchemBugP: mhchem bug P. Please report. (" . $m . ")" );
391        } elseif ( $pattern instanceof Reg ) {
392            $matches = [];
393            $match = preg_match( $pattern->getRegExp(), $input, $matches );
394            if ( $match ) {
395                if ( count( $matches ) > 2 ) {
396                    return [
397                        "match_" => array_slice( $matches, 1 ),
398                        "remainder" => substr( $input, strlen( $matches[0] ) )
399                    ];
400
401                } else {
402                    return [
403                        "match_" => MhchemUtil::issetJS( $matches[1] ?? null ) ? $matches[1] : $matches[0],
404                        "remainder" => substr( $input, strlen( $matches[0] ) )
405                    ];
406                }
407            }
408            return null;
409        } elseif ( is_callable( $pattern ) ) {
410            // $pattern cannot be an instance of MhchemRegExp here, which causes this warning.
411            /* @phan-suppress-next-line PhanUndeclaredInvokeInCallable */
412            return $this->patterns[$m]( $input );
413        } else {
414            return null;
415        }
416    }
417}