Code Coverage for /workspace/src/extensions/Translate/src/PageTranslation/TranslatablePageParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	77.98% covered (warning)	77.98%	85 / 109	75.00% covered (warning)	75.00%	6 / 8	CRAP	0.00% covered (danger)	0.00%	0 / 1
TranslatablePageParser	77.98% covered (warning)	77.98%	85 / 109	75.00% covered (warning)	75.00%	6 / 8	28.65	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
containsMarkup	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
cleanupTags	100.00% covered (success)	100.00%	11 / 11	100.00% covered (success)	100.00%	1 / 1	1
parse	71.43% covered (warning)	71.43%	30 / 42	0.00% covered (danger)	0.00%	0 / 1	9.49
parseSection	100.00% covered (success)	100.00%	18 / 18	100.00% covered (success)	100.00%	1 / 1	3
parseUnit	55.56% covered (warning)	55.56%	15 / 27	0.00% covered (danger)	0.00%	0 / 1	9.16
armourNowiki	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
unarmourNowiki	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1

1	<?php
2	declare( strict_types = 1 );
3
4	namespace MediaWiki\Extension\Translate\PageTranslation;
5
6	use MediaWiki\Extension\Translate\Utilities\ParsingPlaceholderFactory;
7
8	/**
9	* Generates ParserOutput from text or removes all tags from a text.
10	*
11	* @author Niklas Laxström
12	* @license GPL-2.0-or-later
13	* @since 2020.08
14	*/
class TranslatablePageParser {
	/** Produces the placeholder strings that temporarily stand in for pieces of wikitext. */
	private ParsingPlaceholderFactory $placeholderFactory;

	public function __construct( ParsingPlaceholderFactory $placeholderFactory ) {
		$this->placeholderFactory = $placeholderFactory;
	}

	/**
	 * Check whether the text has an opening or closing translate tag outside of
	 * <nowiki> blocks.
	 */
	public function containsMarkup( string $text ): bool {
		// Hide nowiki blocks so that tags quoted inside them are not counted
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );
		return preg_match( '~</?translate[ >]~', $text ) !== 0;
	}

	/**
	 * Remove all opening and closing translate tags following the same whitespace rules as the
	 * regular parsing. This doesn't try to parse the page, so it can handle unbalanced tags.
	 */
	public function cleanupTags( string $text ): string {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );
		// A single newline directly after an opening tag or before a closing tag goes with the tag
		$text = preg_replace( '~<translate( nowrap)?>\n?~s', '', $text );
		$text = preg_replace( '~\n?</translate>~s', '', $text );
		// Markers: headers and the rest
		$ic = preg_quote( TranslationUnit::UNIT_MARKER_INVALID_CHARS, '~' );
		$text = preg_replace( "~(^=.*=) <!--T:[^$ic]+-->$~um", '\1', $text );
		$text = preg_replace( "~<!--T:[^$ic]+-->[\n ]?~um", '', $text );
		// Remove variables
		$unit = new TranslationUnit( $text );
		$text = $unit->getTextForTrans();

		$text = $this->unarmourNowiki( $nowiki, $text );
		return $text;
	}

	/**
	 * Parse the wikitext of a translatable page. Every <translate>...</translate> block is
	 * replaced with a placeholder in the page template and its content is split into
	 * translation units.
	 *
	 * @throws ParsingFailure If translate tags are nested or unbalanced, or if a translation
	 *  unit marker is invalid (see parseUnit).
	 */
	public function parse( string $text ): ParserOutput {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		$sections = [];
		$tagPlaceHolders = [];

		// Consume one <translate> block per iteration until none is left
		while ( true ) {
			$re = '~(<translate(?: nowrap)?>)(.*?)</translate>~s';
			$matches = [];
			$ok = preg_match( $re, $text, $matches, PREG_OFFSET_CAPTURE );

			if ( $ok === 0 || $ok === false ) {
				break; // No match or failure
			}

			$contentWithTags = $matches[0][0];
			$contentWithoutTags = $matches[2][0];
			// These are byte offsets of the whole match (tags included) in $text
			$offsetStart = $matches[0][1];
			$offsetEnd = $offsetStart + strlen( $contentWithTags );

			// Replace the whole match with a placeholder
			$ph = $this->placeholderFactory->make();
			$text = substr( $text, 0, $offsetStart ) . $ph . substr( $text, $offsetEnd );

			// The lazy match stops at the first closing tag, so a second opening tag inside
			// the captured content means the tags are nested.
			if ( preg_match( '~<translate( nowrap)?>~', $contentWithoutTags ) !== 0 ) {
				throw new ParsingFailure(
					'Nested tags',
					[ 'pt-parse-nested', $contentWithoutTags ]
				);
			}

			$openTag = $matches[1][0];
			$canWrap = $openTag !== '<translate nowrap>';

			// Parse the content inside the tags
			$contentWithoutTags = $this->unarmourNowiki( $nowiki, $contentWithoutTags );
			$parse = $this->parseSection( $contentWithoutTags, $canWrap );

			// Update list of sections and the template with the results
			$sections += $parse['sections'];
			$tagPlaceHolders[$ph] = new Section( $openTag, $parse['template'], '</translate>' );
		}

		// Version of the template for error messages, with the parsed blocks abbreviated
		$prettyTemplate = $text;
		foreach ( $tagPlaceHolders as $ph => $value ) {
			$prettyTemplate = str_replace( $ph, '[...]', $prettyTemplate );
		}

		// All balanced pairs are gone by now, so any tag still present has no counterpart
		if ( preg_match( '~<translate( nowrap)?>~', $text ) !== 0 ) {
			throw new ParsingFailure(
				'Unmatched opening tag',
				[ 'pt-parse-open', $prettyTemplate ]
			);
		} elseif ( str_contains( $text, '</translate>' ) ) {
			throw new ParsingFailure(
				"Unmatched closing tag",
				[ 'pt-parse-close', $prettyTemplate ]
			);
		}

		$text = $this->unarmourNowiki( $nowiki, $text );

		return new ParserOutput( $text, $tagPlaceHolders, $sections );
	}

	/**
	 * Splits the content marked with <translate> tags into translation units, which are
	 * separated with two or more newlines. Extra whitespace is captured in the template and
	 * is not included in the translation units.
	 *
	 * @internal
	 * @param string $text Content between the opening and the closing tag
	 * @param bool $canWrap Value passed to setCanWrap() of every unit found
	 * @return array With keys 'template' (the text with every unit replaced by a placeholder)
	 *  and 'sections' (map of placeholder to TranslationUnit)
	 * @throws ParsingFailure See parseUnit
	 */
	public function parseSection( string $text, bool $canWrap ): array {
		$flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE;
		// Delimiters are leading whitespace, trailing whitespace and blank lines together with
		// the whitespace around them. They are captured so they can be kept in the template.
		$parts = preg_split( '~(^\s*|\s*\n\n\s*|\s*$)~', $text, -1, $flags );

		// Content without any newline is flagged as inline
		$inline = preg_match( '~\n~', $text ) === 0;

		$template = '';
		$sections = [];

		foreach ( $parts as $part ) {
			if ( trim( $part ) === '' ) {
				// Whitespace-only parts are the separators: keep them verbatim
				$template .= $part;
			} else {
				$ph = $this->placeholderFactory->make();
				$tpsection = $this->parseUnit( $part );
				$tpsection->setIsInline( $inline );
				$tpsection->setCanWrap( $canWrap );
				$sections[$ph] = $tpsection;
				$template .= $ph;
			}
		}

		return [
			'template' => $template,
			'sections' => $sections,
		];
	}

	/**
	 * Checks if this unit already contains a section marker. If there
	 * is not, a new one will be created. Marker will have the value of
	 * -1, which will later be replaced with a real value.
	 *
	 * @internal
	 * @param string $content Text of a single translation unit
	 * @return TranslationUnit Unit built from the content with the marker stripped
	 * @throws ParsingFailure If there is more than one marker, the marker is in an
	 *  unsupported position, or nothing but the marker is present
	 */
	public function parseUnit( string $content ): TranslationUnit {
		$re = '~<!--T:(.*?)-->~';
		$matches = [];
		$count = preg_match_all( $re, $content, $matches, PREG_SET_ORDER );

		if ( $count > 1 ) {
			throw new ParsingFailure(
				'Multiple translation unit markers',
				[ 'pt-shake-multiple', $content ]
			);
		}

		// If no id given in the source, default to a new section id
		$id = TranslationUnit::NEW_UNIT_ID;
		if ( $count === 1 ) {
			// Exactly one match set: take the captured id from it
			[ /*full*/, $id ] = $matches[0];

			// Currently handle only these two standard places.
			// Is this too strict?
			$rer1 = '~^<!--T:(.*?)-->( |\n)~'; // Normal sections
			$rer2 = '~\s*<!--T:(.*?)-->$~m'; // Sections with title
			$content = preg_replace( $rer1, '', $content );
			$content = preg_replace( $rer2, '', $content );

			// A marker that survived both removals is somewhere else in the unit
			if ( preg_match( $re, $content ) === 1 ) {
				throw new ParsingFailure(
					'Translation unit marker is in unsupported position',
					[ 'pt-shake-position', $content ]
				);
			} elseif ( trim( $content ) === '' ) {
				throw new ParsingFailure(
					'Translation unit has no content besides marker',
					[ 'pt-shake-empty', $id ]
				);
			}
		}

		return new TranslationUnit( $content, $id );
	}

	/**
	 * Replace every <nowiki>...</nowiki> block with a placeholder so that the contents are
	 * hidden from the tag matching done elsewhere in this class.
	 *
	 * @internal
	 * @param array &$holders Map of placeholder to the original block; new entries are added
	 * @param string $text
	 * @return string Text with the nowiki blocks replaced
	 */
	public function armourNowiki( array &$holders, string $text ): string {
		$re = '~(<nowiki>)(.*?)(</nowiki>)~s';

		while ( preg_match( $re, $text, $matches ) ) {
			$ph = $this->placeholderFactory->make();
			// str_replace swaps every identical block at once, so duplicates share a placeholder
			$text = str_replace( $matches[0], $ph, $text );
			$holders[$ph] = $matches[0];
		}

		return $text;
	}

	/**
	 * Reverse of armourNowiki: put the original blocks back in place of their placeholders.
	 *
	 * @internal
	 * @param array $holders Map of placeholder to the original block
	 * @param string $text
	 * @return string
	 */
	public function unarmourNowiki( array $holders, string $text ): string {
		return strtr( $text, $holders );
	}
}