Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 287
0.00% covered (danger)
0.00%
0 / 6
CRAP
0.00% covered (danger)
0.00%
0 / 1
MhchemPatterns
0.00% covered (danger)
0.00%
0 / 287
0.00% covered (danger)
0.00%
0 / 6
1806
0.00% covered (danger)
0.00%
0 / 1
 getPatterns
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 findObserveGroups
0.00% covered (danger)
0.00%
0 / 28
0.00% covered (danger)
0.00%
0 / 1
132
 matchObsGrInner
0.00% covered (danger)
0.00%
0 / 11
0.00% covered (danger)
0.00%
0 / 1
30
 findObserveGroupsInner
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
56
 __construct
0.00% covered (danger)
0.00%
0 / 212
0.00% covered (danger)
0.00%
0 / 1
132
 match
0.00% covered (danger)
0.00%
0 / 20
0.00% covered (danger)
0.00%
0 / 1
56
1<?php
2/**
3 * Copyright (c) 2023 Johannes Stegmüller
4 *
5 * This file is a port of mhchemParser originally authored by Martin Hensel in javascript/typescript.
6 * The original license for this software can be found in the accompanying LICENSE.mhchemParser-ts.txt file.
7 */
8
9declare( strict_types = 1 );
10
11namespace MediaWiki\Extension\Math\WikiTexVC\Mhchem;
12
13use MediaWiki\Extension\Math\WikiTexVC\Mhchem\MhchemRegExp as Reg;
14use RuntimeException;
15
16/**
17 * Contains all matching regex patterns and match functions for mhchemParser in PHP.
18 *
19 * corresponds mostly to the 'patterns' array in line ~207 in mhchemParser.js by Martin Hensel
20 *
21 * @author Johannes Stegmüller
22 * @license GPL-2.0-or-later
23 */
24class MhchemPatterns {
25
26    /** @var array */
27    private array $patterns;
28
29    /**
30     * Matching patterns
31     * either regexes or function that return null or {match_:"a", remainder:"bc"}
32     * @return array
33     */
34    public function getPatterns(): array {
35        return $this->patterns;
36    }
37
38    public function findObserveGroups( $input, $begExcl, $begIncl, $endIncl,
39                                $endExcl = null, $beg2Excl = null, $beg2Incl = null,
40                                $end2Incl = null, $end2Excl = null, $combine = null ): ?array {
41        $match = $this->matchObsGrInner( $input, $begExcl );
42        if ( $match === null ) {
43            return null;
44        }
45        $input = substr( $input, strlen( $match ) );
46        $match = $this->matchObsGrInner( $input, $begIncl );
47        if ( $match === null ) {
48            return null;
49        }
50
51        if ( $endIncl === "0" ) {
52            throw new RuntimeException( "error in condition, check next loc " );
53        }
54        $e = $this->findObserveGroupsInner( $input, strlen( $match ),
55            MhchemUtil::issetJS( $endIncl ) ? $endIncl : $endExcl );
56        if ( $e === null ) {
57            return null;
58        }
59        $match1 = substr( $input, 0, ( $endIncl ? $e["endMatchEnd"] : $e["endMatchBegin"] ) );
60
61        if ( !( MhchemUtil::issetJS( $beg2Excl ) || MhchemUtil::issetJS( $beg2Incl ) ) ) {
62            return [
63                "match_" => $match1,
64                "remainder" => substr( $input, $e["endMatchEnd"] )
65            ];
66        } else {
67            $group2 = $this->findObserveGroups( substr( $input, $e["endMatchEnd"] ),
68                $beg2Excl, $beg2Incl, $end2Incl, $end2Excl );
69            if ( $group2 === null ) {
70                return null;
71            }
72            $matchRet = [ $match1, $group2["match_"] ];
73            return [
74                "match_" => ( $combine ? implode( "", $matchRet ) : $matchRet ),
75                "remainder" => $group2["remainder"]
76            ];
77        }
78    }
79
80    private function matchObsGrInner( string $input, $pattern ) {
81        /**
82         * In javascript this is checking if the incoming pattern is a string,
83         * if not the assumption is that it is of regex type. Since PHP has
84         * strings here.
85         */
86        if ( !$pattern instanceof Reg ) {
87            // Added this if to catch empty needle for strpos input  in PHP
88            if ( !MhchemUtil::issetJS( $pattern ) ) {
89                return $pattern;
90            }
91            if ( strpos( $input, $pattern ) !== 0 ) {
92                return null;
93            }
94            return $pattern;
95        } else {
96            $matches = [];
97            $match = preg_match( $pattern->getRegExp(), $input, $matches );
98            if ( !$match ) {
99                return null;
100            }
101            return $matches[0];
102        }
103    }
104
105    private function findObserveGroupsInner( string $input, $i, $endChars ): ?array {
106        $braces = 0;
107        while ( $i < strlen( $input ) ) {
108            $a = $input[$i];
109            $match = $this->matchObsGrInner( substr( $input, $i ), $endChars );
110            if ( $match !== null && $braces === 0 ) {
111                return [ "endMatchBegin" => $i, "endMatchEnd" => $i + strlen( $match ) ];
112            } elseif ( $a === "{" ) {
113                $braces++;
114            } elseif ( $a === "}" ) {
115                if ( $braces === 0 ) {
116                    // Unexpected character
117                    throw new RuntimeException(
118                        "ExtraCloseMissingOpen: Extra close brace or missing open brace" );
119                } else {
120                    $braces--;
121                }
122            }
123            $i++;
124        }
125        return null;
126    }
127
128    public function __construct() {
129        $this->patterns = [
130            'empty' => new Reg( "/^$/" ),
131            'else' => new Reg( "/^./" ),
132            'else2' => new Reg( "/^./" ),
133            'space' => new Reg( "/^\s/" ),
134            'space A' => new Reg( "/^\s(?=[A-Z\\\\$])/" ),
135            'space$' => new Reg( "/^\s$/" ),
136            'a-z' => new Reg( "/^[a-z]/" ),
137            'x' => new Reg( "/^x/" ),
138            'x$' => new Reg( "/^x$/" ),
139            'i$' => new Reg( "/^i$/" ),
140            'letters' => new Reg(
141                "/^(?:[a-zA-Z\x{03B1}-\x{03C9}\x{0391}-\x{03A9}?@]|(?:\\\\(?:alpha|beta|gamma|delta|epsilon"
142                . "|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega|Gamma"
143                . "|Delta|Theta|Lambda|Xi|Pi|Sigma|Upsilon|Phi|Psi|Omega)(?:\s+|\{\}|(?![a-zA-Z]))))+/u" ),
144            '\\greek' => new Reg(
145                "/^\\\\(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi"
146                . "|rho|sigma|tau|upsilon|phi|chi|psi|omega|Gamma|Delta|Theta|Lambda|Xi|Pi|Sigma|Upsilon|Phi|Psi|Omega)"
147                . "(?:\s+|\{\}|(?![a-zA-Z]))/" ),
148            'one lowercase latin letter $' => new Reg( "/^(?:([a-z])(?:$|[^a-zA-Z]))$/" ),
149            '$one lowercase latin letter$ $' => new Reg( "/^\\\$(?:([a-z])(?:$|[^a-zA-Z]))\\\$$/" ),
150            'one lowercase greek letter $' => new Reg(
151                "/^(?:\\\$?[\x{003B1}-\x{0003C9}]\\\$?|\\\$?\\\\(?:alpha|beta|gamma|" .
152                "delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|" .
153                "phi|chi|psi|omega)\s*\\\$?)(?:\s+|\{\}|(?![a-zA-Z]))$/u" ),
154            'digits' => new Reg( "/^[0-9]+/" ),
155            '-9.,9' => new Reg( "/^[+\-]?(?:[0-9]+(?:[,.][0-9]+)?|[0-9]*(?:\.[0-9]+))/" ),
156            '-9.,9 no missing 0' => new Reg( "/^[+\-]?[0-9]+(?:[.,][0-9]+)?/" ),
157            '(-)(9.,9)(e)(99)' => static function ( $input ) {
158                $matches = [];
159                $match = preg_match( "/^(\+\-|\+\/\-|\+|\-|\\\\pm\s?)?([0-9]+(?:[,.][0-9]+)?|" .
160                    "[0-9]*(?:\.[0-9]+))?(\((?:[0-9]+(?:[,.][0-9]+)?|[0-9]*(?:\.[0-9]+))\))?(?:(?:([eE])" .
161                    "|\s*(\*|x|\\\\times|\x{00D7})\s*10\^)([+\-]?[0-9]+|\{[+\-]?[0-9]+\}))?/u", $input, $matches );
162                if ( $match && $matches[0] ) {
163                    // could also match ""
164                    return [ "match_" => array_slice( $matches, 1 ),
165                        "remainder" => substr( $input, strlen( $matches[0] ) ) ];
166                }
167                return null;
168            },
169            '(-)(9)^(-9)' => new Reg( "/^(\+\-|\+\/\-|\+|\-|\\\\pm\s?)?([0-9]+(?:[,.][0-9]+)?|"
170                . "[0-9]*(?:\.[0-9]+)?)\^([+\-]?[0-9]+|\{[+\-]?[0-9]+\})/" ),
171            'state of aggregation $' => function ( $input ) {
172                // ... or crystal system
173                $a = $this->findObserveGroups( $input, "",
174                    new Reg( "/^\([a-z]{1,3}(?=[\),])/" ), ")", "" );
175                if ( $a && preg_match( "/^($|[\s,;\)\]\}])/", $a["remainder"] ) ) {
176                    return $a;
177                }
178                $matches = [];
179                $match = preg_match( "/^(?:\((?:\\\\ca\s?)?\\\$[amothc]\\\$\))/", $input, $matches );
180                if ( $match ) {
181                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
182                }
183                return null;
184            },
185            '_{(state of aggregation)}$' => new Reg( "/^_\{(\([a-z]{1,3}\))\}/" ),
186            '{[(' => new Reg( "/^(?:\\\{|\[|\()/" ),
187            ')]}' => new Reg( "/^(?:\)|\]|\\\})/" ),
188            ', ' => new Reg( "/^[,;]\s*/" ),
189            ',' => new Reg( "/^[,;]/" ),
190            '.' => new Reg( "/^[.]/" ),
191            '. __* ' => new Reg( "/^([.\x{22C5}\x{00B7}\x{2022}]|[*])\s*/u" ),
192            '...' => new Reg( "/^\.\.\.(?=$|[^.])/" ),
193            '^{(...)}' => function ( $input ) {
194                    return $this->findObserveGroups( $input, "^{", "", "", "}" );
195            },
196            '^($...$)' => function ( $input ) {
197                    return $this->findObserveGroups( $input, "^", "$", "$", "" );
198            },
199            '^a' => new Reg( "/^\^([0-9]+|[^\\\_])/u" ),
200            '^\\x{}{}' => function ( $input ) {
201                    return $this->findObserveGroups( $input, "^",
202                        new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "", "",
203                        "{", "}", "", true );
204            },
205            '^\\x{}' => function ( $input ) {
206                    return $this->findObserveGroups( $input, "^",
207                        new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "" );
208            },
209            '^\\x' => new Reg( "/^\^(\\\\[a-zA-Z]+)\s*/" ),
210            '^(-1)' => new Reg( "/^\^(-?\d+)/" ),
211            '\'' => new Reg( "/^'/" ),
212            '_{(...)}' => function ( $input ) {
213                return $this->findObserveGroups( $input, "_{", "", "", "}" );
214            },
215            '_($...$)' => function ( $input ) {
216                return $this->findObserveGroups( $input, "_", "$", "$", "" );
217            },
218            '_9' => new Reg( "/^_([+\-]?[0-9]+|[^\\\\])/" ),
219            '_\\x{}{}' => function ( $input ) {
220                return $this->findObserveGroups( $input, "_", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
221                    "", "", "{", "}", "", true );
222            },
223            '_\\x{}' => function ( $input ) {
224                return $this->findObserveGroups( $input, "_",
225                    new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}", "" );
226            },
227            '_\\x' => new Reg( "/^_(\\\\[a-zA-Z]+)\s*/" ),
228            '^_' => new Reg( "/^(?:\^(?=_)|\_(?=\^)|[\^_]$)/" ),
229            '{}^' => new Reg( "/^\{\}(?=\^)/" ),
230            '{}' => new Reg( "/^\{\}/" ),
231            '{...}' => function ( $input ) {
232                return $this->findObserveGroups( $input, "", "{", "}", "" );
233            },
234            '{(...)}' => function ( $input ) {
235                return $this->findObserveGroups( $input, "{", "", "", "}" );
236            },
237            '$...$' => function ( $input ) {
238                return $this->findObserveGroups( $input, "", "\$", "\$", "" );
239            },
240            '${(...)}$__$(...)$' => function ( $input ) {
241                return $this->findObserveGroups( $input, "\${", "", "", "}\$" )
242                    ?? $this->findObserveGroups( $input, "\$", "", "", "\$" );
243            },
244            '=<>' => new Reg( "/^[=<>]/" ),
245            '#' => new Reg( "/^[#\x{2261}]/u" ),
246            '+' => new Reg( "/^\+/" ),
247            // -space -, -; -] -/ -$ -state-of-aggregation orig:  "/^-(?=[\s_},;\]/]|$|\([a-z]+\))/"
248            '-$' => new Reg( "/^-(?=[\s_},;\]\/]|$|\([a-z]+\))/u" ),
249            '-9' => new Reg( "/^-(?=[0-9])/" ),
250            '- orbital overlap' => new Reg( "/^-(?=(?:[spd]|sp)(?:$|[\s,;\)\]\}]))/" ),
251            '-' => new Reg( "/^-/" ),
252            'pm-operator' => new Reg( "/^(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)/" ),
253            'operator' => new Reg( "/^(?:\+|(?:[\-=<>]|<<|>>|\\\\approx|\\\$\\\\approx\\\$)(?=\s|$|-?[0-9]))/" ),
254            'arrowUpDown' => new Reg( "/^(?:v|\(v\)|\^|\(\^\))(?=$|[\s,;\)\]\}])/" ),
255            '\\bond{(...)}' => function ( $input ) {
256                return $this->findObserveGroups( $input, "\\bond{", "", "", "}" );
257            },
258            '->' => new Reg( '/^(?:<->|<-->|->|<-|<=>>|<<=>|<=>|[\x{2192}\x{27F6}\x{21CC}])/u' ),
259            'CMT' => new Reg( "/^[CMT](?=\[)/" ),
260            '[(...)]' => function ( $input ) { return $this->findObserveGroups( $input, "[", "",
261                "", "]" );
262            },
263            '1st-level escape' => new Reg( "/^(&|\\\\\\\\|\\\\hline)\s*/" ),
264            // \\x - but output no space before
265            '\\,' => new Reg( "/^(?:\\\\[,\ ;:])/" ),
266            '\\x{}{}' => function ( $input ) {
267                return $this->findObserveGroups( $input, "", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
268                    "", "", "{", "}", "", true );
269            },
270            '\\x{}' => function ( $input ) {
271                return $this->findObserveGroups( $input, "", new Reg( "/^\\\\[a-zA-Z]+\{/" ), "}",
272                    "" );
273            },
274            '\\ca' => new Reg( "/^\\\\ca(?:\s+|(?![a-zA-Z]))/" ),
275            '\\x' => new Reg( "/^(?:\\\\[a-zA-Z]+\s*|\\\\[_&{}%])/" ),
276            // only those with numbers in front, because the others will be formatted correctly anyway
277            'orbital' => new Reg( "/^(?:[0-9]{1,2}[spdfgh]|[0-9]{0,2}sp)(?=$|[^a-zA-Z])/" ),
278            'others' => new Reg( "/^[\/~|]/" ),
279            '\\frac{(...)}' => function ( $input ) {
280                    return $this->findObserveGroups( $input, "\\frac{", "",
281                        "", "}", "{", "", "", "}" );
282            },
283            '\\overset{(...)}' => function ( $input ) {
284                    return $this->findObserveGroups( $input, "\\overset{", "",
285                        "", "}", "{", "", "", "}" );
286            },
287            '\\underset{(...)}' => function ( $input ) {
288                    return $this->findObserveGroups( $input, "\\underset{", "",
289                        "", "}", "{", "", "", "}" );
290            },
291            '\\underbrace{(...)}' => function ( $input ) {
292                    return $this->findObserveGroups( $input, "\\underbrace{", "",
293                        "", "}_", "{", "", "", "}" );
294            },
295            '\\color{(...)}' => function ( $input ) {
296                    return $this->findObserveGroups( $input, "\\color{", "", "", "}" );
297            },
298            '\\color{(...)}{(...)}' => function ( $input ) {
299                // ?? instead of ||
300                return $this->findObserveGroups( $input, "\\color{", "",
301                    "", "}", "{", "", "", "}" ) ??
302                $this->findObserveGroups( $input, "\\color", "\\", "",
303                    new Reg( "/^(?=\{)/" ), "{", "", "", "}" );
304            },
305            '\\ce{(...)}' => function ( $input ) {
306                return $this->findObserveGroups( $input, "\\ce{", "", "", "}" );
307            },
308            '\\pu{(...)}' => function ( $input ) { return $this->findObserveGroups( $input,
309                "\\pu{", "", "", "}" );
310            },
311            'oxidation$' => new Reg( "/^(?:[+-][IVX]+|(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)\s*0)$/" ),
312            'd-oxidation$' => new Reg( "/^(?:[+-]?[IVX]+|(?:\\\\pm|\\\$\\\\pm\\\$|\+-|\+\/-)\s*0)$/" ),
313            '1/2$' => new Reg( "/^[+\-]?(?:[0-9]+|\\\$[a-z]\\\$|[a-z])\/[0-9]+(?:\\\$[a-z]\\\$|[a-z])?$/" ),
314            'amount' => function ( $input ) {
315                $matches = [];
316                // e.g. 2, 0.5, 1/2, -2, n/2, +;  $a$ could be added later in parsing
317                $match = preg_match( "/^(?:(?:(?:\([+\-]?[0-9]+\/[0-9]+\)|[+\-]?(?:[0-9]+|\\\$[a-z]\\\$" .
318                    "|[a-z])\/[0-9]+|[+\-]?[0-9]+[.,][0-9]+|[+\-]?\.[0-9]+|[+\-]?[0-9]+)(?:[a-z](?=\s*[A-Z]))?)" .
319                    "|[+\-]?[a-z](?=\s*[A-Z])|\+(?!\s))/", $input, $matches );
320
321                if ( $match ) {
322                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
323                }
324                $a = $this->findObserveGroups( $input, "", "$", "$", "" );
325                // e.g. $2n-1$, $-$
326                if ( MhchemUtil::issetJS( $a ) ) {
327                    $matchesI = [];
328
329                    $match = preg_match( "/^\\\$(?:\(?[+\-]?(?:[0-9]*[a-z]?[+\-])" .
330                        "?[0-9]*[a-z](?:[+\-][0-9]*[a-z]?)?\)?|\+|-)\\\$$/", $a["match_"] ?? "",
331                        $matchesI );
332                    if ( $match ) {
333                        return [ "match_" => $matchesI[0], "remainder" => substr( $input, strlen( $matchesI[0] ) ) ];
334                    }
335                }
336                return null;
337            },
338            'amount2' => function ( $input ) {
339                /* @phan-suppress-next-line PhanInfiniteRecursion, PhanUndeclaredInvokeInCallable */
340                return $this->patterns['amount']( $input );
341            },
342            '(KV letters),' => new Reg( "/^(?:[A-Z][a-z]{0,2}|i)(?=,)/" ),
343            'formula$' => static function ( $input ) {
344                if ( preg_match( "/^\([a-z]+\)$/", $input ) ) {
345                    // state of aggregation = no formula
346                    return null;
347                }
348                $matches = [];
349                $match = preg_match( "/^(?:[a-z]|(?:[0-9\ \+\-\,\.\(\)]+[a-z])+[0-9\ \+\-\,\.\(\)]*|"
350                    . "(?:[a-z][0-9\ \+\-\,\.\(\)]+)+[a-z]?)$/", $input, $matches );
351                if ( $match ) {
352                    return [ "match_" => $matches[0], "remainder" => substr( $input, strlen( $matches[0] ) ) ];
353                }
354                return null;
355            },
356            'uprightEntities' => new Reg( "/^(?:pH|pOH|pC|pK|iPr|iBu)(?=$|[^a-zA-Z])/" ),
357            '/' => new Reg( "/^\s*(\/)\s*/" ),
358            '//' => new Reg( "/^\s*(\/\/)\s*/" ),
359            '*' => new Reg( "/^\s*[*.]\s*/" )
360        ];
361    }
362
363    /**
364     * Matching function
365     * e.g. match("a", input) will look for the regexp called "a" and see if it matches
366     * returns null or {match_:"a", remainder:"bc"}
367     * @param string $m key for fetching a pattern
368     * @param string $input string to check
369     * @return array|mixed|null information about the match
370     */
371    public function match( string $m, string $input ) {
372        $pattern = $this->patterns[$m] ?? null;
373        if ( !$pattern ) {
374            // Trying to use non-existing pattern
375            throw new RuntimeException( "MhchemBugP: mhchem bug P. Please report. (" . $m . ")" );
376        } elseif ( $pattern instanceof Reg ) {
377            $matches = [];
378            $match = preg_match( $pattern->getRegExp(), $input, $matches );
379            if ( $match ) {
380                if ( count( $matches ) > 2 ) {
381                    return [
382                        "match_" => array_slice( $matches, 1 ),
383                        "remainder" => substr( $input, strlen( $matches[0] ) )
384                    ];
385
386                } else {
387                    return [
388                        "match_" => MhchemUtil::issetJS( $matches[1] ?? null ) ? $matches[1] : $matches[0],
389                        "remainder" => substr( $input, strlen( $matches[0] ) )
390                    ];
391                }
392            }
393            return null;
394        } elseif ( is_callable( $pattern ) ) {
395            // $pattern cannot be an instance of MhchemRegExp here, which causes this warning.
396            /* @phan-suppress-next-line PhanUndeclaredInvokeInCallable */
397            return $this->patterns[$m]( $input );
398        } else {
399            return null;
400        }
401    }
402}