Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 40 |
|
0.00% |
0 / 7 |
CRAP | |
0.00% |
0 / 1 |
HieroTokenizer | |
0.00% |
0 / 40 |
|
0.00% |
0 / 7 |
342 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tokenize | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
56 | |||
newBlock | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
newToken | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
singleCharBlock | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
dot | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
char | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | /** |
3 | * Copyright (C) 2004 Guillaume Blanchard (Aoineko) |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | */ |
20 | |
21 | namespace WikiHiero; |
22 | |
/**
 * Hieroglyphs tokenizer
 *
 * Turns hieroglyph markup into a list of blocks, each block being a list of
 * string tokens. The text is scanned one byte at a time; the class keeps the
 * token and the block that are currently being assembled and flushes them
 * whenever a delimiter is met.
 */
class HieroTokenizer {

	/** Characters that close the current block; they are not emitted themselves */
	private const DELIMITERS = " -\t\n\r";
	/** Characters that close the current token and are emitted as a token of their own */
	private const TOKEN_DELIMITERS = '*:()';
	/** Character that closes the current block and is emitted as a block of its own */
	private const SINGLE_CHAR_DELIMITER = '!';

	/** @var string Raw text passed to the constructor */
	private $text;
	/** @var string[][]|false Finished blocks, or false while tokenize() has not run yet */
	private $blocks = false;
	/** @var string[] Tokens of the block currently being assembled */
	private $currentBlock;
	/** @var string Token currently being assembled, '' when there is none */
	private $token;

	/**
	 * Constructor
	 *
	 * @param string $text Text to tokenize; nothing is parsed until tokenize() is called
	 */
	public function __construct( $text ) {
		$this->text = $text;
	}

	/**
	 * Split text into blocks, then split blocks into items
	 *
	 * The result is computed on the first call and returned unchanged by
	 * every later call.
	 *
	 * @return string[][] tokenized text
	 *
	 * @suppress PhanParamSuspiciousOrder
	 */
	public function tokenize() {
		if ( $this->blocks !== false ) {
			return $this->blocks;
		}

		$this->blocks = [];
		$this->currentBlock = [];
		$this->token = '';

		// remove HTML comments
		$text = preg_replace( '/\\<!--.*?--\\>/s', '', $this->text );
		if ( $text === null ) {
			// preg_replace() returns null on a PCRE error (for instance when the
			// backtrack limit is exceeded). Tokenize the unstripped text in that
			// case rather than handing null to strlen() and returning nothing.
			$text = (string)$this->text;
		}

		for ( $i = 0, $len = strlen( $text ); $i < $len; $i++ ) {
			$char = $text[$i];

			if ( strpos( self::DELIMITERS, $char ) !== false ) {
				$this->newBlock();
			} elseif ( $char === self::SINGLE_CHAR_DELIMITER ) {
				$this->singleCharBlock( $char );
			} elseif ( $char === '.' ) {
				$this->dot();
			} elseif ( strpos( self::TOKEN_DELIMITERS, $char ) !== false ) {
				$this->newToken( $char );
			} else {
				$this->char( $char );
			}
		}

		// flush stuff being processed
		$this->newBlock();

		return $this->blocks;
	}

	/**
	 * Handles a block delimiter
	 *
	 * Flushes the pending token, then appends the current block to the
	 * result if it holds at least one token. Empty blocks are never emitted,
	 * so consecutive delimiters collapse.
	 */
	private function newBlock() {
		$this->newToken();
		if ( $this->currentBlock ) {
			$this->blocks[] = $this->currentBlock;
			$this->currentBlock = [];
		}
	}

	/**
	 * Flushes current token, optionally adds another one
	 *
	 * @param string|bool $token token to add or false
	 */
	private function newToken( $token = false ) {
		if ( $this->token !== '' ) {
			$this->currentBlock[] = $this->token;
			$this->token = '';
		}
		if ( $token !== false ) {
			$this->currentBlock[] = $token;
		}
	}

	/**
	 * Adds a block consisting of one character
	 *
	 * Whatever was being assembled is flushed first, so the character always
	 * ends up alone in its block.
	 *
	 * @param string $char block character
	 */
	private function singleCharBlock( $char ) {
		$this->newBlock();
		$this->blocks[] = [ $char ];
	}

	/**
	 * Handles void blocks represented by dots
	 *
	 * A dot directly following a pending single dot completes a '..' block.
	 * Any other dot closes the current block and is kept pending, so that the
	 * next character decides whether it becomes '.' or '..'.
	 */
	private function dot() {
		if ( $this->token === '.' ) {
			$this->token = '..';
			$this->newBlock();
		} else {
			$this->newBlock();
			$this->token = '.';
		}
	}

	/**
	 * Adds a miscellaneous character to current token
	 *
	 * A pending single dot is emitted as a block of its own before the
	 * character starts a new token.
	 *
	 * @param string $char character to add
	 */
	private function char( $char ) {
		if ( $this->token === '.' ) {
			$this->newBlock();
			$this->token = $char;
		} else {
			$this->token .= $char;
		}
	}
}