Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
41.82% |
46 / 110 |
|
0.00% |
0 / 2 |
CRAP | |
0.00% |
0 / 1 |
AddRedLinks | |
41.82% |
46 / 110 |
|
0.00% |
0 / 2 |
291.25 | |
0.00% |
0 / 1 |
run | |
54.12% |
46 / 85 |
|
0.00% |
0 / 1 |
74.10 | |||
getVariantTitles | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
182 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
5 | |
6 | use DOMDocument; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\Language\LanguageConverter; |
12 | use Wikimedia\Parsoid\Utils\DOMCompat; |
13 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
14 | use Wikimedia\Parsoid\Utils\PHPUtils; |
15 | use Wikimedia\Parsoid\Utils\UrlUtils; |
16 | use Wikimedia\Parsoid\Utils\WTUtils; |
17 | use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; |
18 | |
/**
 * DOM processor that annotates wiki links (`a[rel~="mw:WikiLink"]`) using the
 * page information returned by the data access layer: missing pages get the
 * `new` class and an edit/redlink query string, links to the context title get
 * self-link classes, redirects get `mw-redirect`, and any extra link classes
 * reported for the page are added as well.
 */
class AddRedLinks implements Wt2HtmlDOMProcessor {

	/**
	 * Batch size to use for fetching page data to avoid exceeding LinkCache::MAX_SIZE
	 */
	private const LINK_BATCH_SIZE = 1000;

	/**
	 * Add red links to a document.
	 *
	 * Links are processed in batches of self::LINK_BATCH_SIZE. For each batch
	 * the distinct `title` attribute values are looked up via
	 * DataAccess::getPageInfo(), missing titles are retried through their
	 * language variants, and every link's `class`, `title` and `href`
	 * attributes are updated accordingly.
	 *
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		'@phan-var Element|DocumentFragment $root'; // @var Element|DocumentFragment $root
		$allLinks = PHPUtils::iterable_to_array(
			DOMCompat::querySelectorAll( $root, 'a[rel~="mw:WikiLink"]' )
		);

		// Split up processing into chunks of 1000 so that we don't exceed LinkCache::MAX_SIZE
		$chunks = array_chunk( $allLinks, self::LINK_BATCH_SIZE );
		foreach ( $chunks as $links ) {
			// Use a map so that each distinct title is only looked up once.
			$titles = [];
			foreach ( $links as $a ) {
				$t = DOMCompat::getAttribute( $a, 'title' );
				if ( $t !== null ) {
					$titles[$t] = true;
				}
			}

			if ( !$titles ) {
				// Nothing to look up in this chunk, but later chunks may
				// still contain titled links, so keep going rather than
				// bailing out of the whole pass.
				continue;
			}

			// array_keys converts strings representing numbers to ints.
			// So, convert them back to strings before handing them off.
			$titleList = array_map( 'strval', array_keys( $titles ) );

			$start = microtime( true );
			$titleMap = $env->getDataAccess()->getPageInfo( $env->getPageConfig(), $titleList );
			if ( $env->profiling() ) {
				$profile = $env->getCurrentProfile();
				$profile->bumpMWTime( "RedLinks", 1000 * ( microtime( true ) - $start ), "api" );
				$profile->bumpCount( "RedLinks" );
			}

			$prefixedTitleText = $env->getContextTitle()->getPrefixedText();

			$variantMap = $this->getVariantTitles(
				$env,
				$root->ownerDocument,
				$titles,
				$titleMap
			);

			foreach ( $links as $a ) {
				$k = DOMCompat::getAttribute( $a, 'title' );
				if ( $k === null ) {
					continue;
				}

				// Data for a resolved variant takes precedence over the
				// data for the title as written.
				$variantData = $variantMap[$k] ?? null;
				$data = $variantData ?? $titleMap[$k] ?? null;

				if ( $data === null ) {
					// Likely a consequence of T237535; can be removed once
					// that is fixed.
					$env->log( 'warn', 'We should have data for the title: ' . $k );
					continue;
				}

				// Convert links pointing to a variant title (T258856)
				if ( $variantData !== null ) {
					$variantTitle = $env->makeTitleFromURLDecodedStr(
						$variantData['variantTitle']
					);

					$origHref = DOMCompat::getAttribute( $a, 'href' );
					$origUrl = UrlUtils::parseUrl( $origHref ?? '' );

					// Point at the variant title but keep the original
					// query string and fragment.
					$newUrl = UrlUtils::parseUrl( $env->makeLink( $variantTitle ) );
					$newUrl['query'] = $origUrl['query'];
					$newUrl['fragment'] = $origUrl['fragment'];

					$variantPrefixedText = $variantTitle->getPrefixedText();
					DOMDataUtils::addNormalizedAttribute(
						$a, 'title', $variantPrefixedText, $k
					);
					// Set $k to the new title for the selflink check below.
					// Note that getVariantTitles doesn't set $variantData for
					// missing titles, so we won't be in this block for the
					// red-link-title case below.
					$k = $variantPrefixedText;

					DOMDataUtils::addNormalizedAttribute(
						$a,
						'href',
						UrlUtils::assembleUrl( $newUrl ),
						$origHref,
						// Ensure we preserve the real original value
						// added during initial link parsing.
						true
					);
				}

				$a->removeAttribute( 'class' ); // Clear all, if we're doing a pb2pb refresh

				$href = DOMCompat::getAttribute( $a, 'href' );
				$parsedURL = UrlUtils::parseUrl( $href ?? '' );

				$queryElts = [];
				if ( isset( $parsedURL['query'] ) ) {
					parse_str( $parsedURL['query'], $queryElts );
				}

				if (
					!empty( $data['missing'] ) && empty( $data['known'] ) &&
					$k !== $prefixedTitleText
				) {
					// Red link: the page is missing, not otherwise known,
					// and is not the context title itself.
					DOMCompat::getClassList( $a )->add( 'new' );
					WTUtils::addPageContentI18nAttribute( $a, 'title', 'red-link-title', [ $k ] );
					$queryElts['action'] = 'edit';
					$queryElts['redlink'] = '1';
				} else {
					if ( $k === $prefixedTitleText ) {
						if ( isset( $parsedURL['fragment'] ) ) {
							DOMCompat::getClassList( $a )->add( 'mw-selflink-fragment' );
						} else {
							DOMCompat::getClassList( $a )->add( 'mw-selflink', 'selflink' );
						}
						$a->removeAttribute( 'title' );
					}
					// Clear a potential redlink, if we're doing a pb2pb refresh
					// This is similar to what's happening in Html2Wt/RemoveRedLinks
					// and maybe that pass should just run before this one.
					if ( isset( $queryElts['action'] ) && $queryElts['action'] === 'edit' ) {
						unset( $queryElts['action'] );
					}
					if ( isset( $queryElts['redlink'] ) && $queryElts['redlink'] === '1' ) {
						unset( $queryElts['redlink'] );
					}
				}

				if ( count( $queryElts ) === 0 ) {
					// avoids the insertion of ? on empty query string
					$parsedURL['query'] = null;
				} else {
					$parsedURL['query'] = http_build_query( $queryElts );
				}
				$newHref = UrlUtils::assembleUrl( $parsedURL );

				$a->setAttribute( 'href', $newHref );

				if ( !empty( $data['redirect'] ) ) {
					DOMCompat::getClassList( $a )->add( 'mw-redirect' );
				}
				foreach ( $data['linkclasses'] ?? [] as $extraClass ) {
					DOMCompat::getClassList( $a )->add( $extraClass );
				}
			}
		}
	}

	/**
	 * Attempt to resolve nonexistent link targets using their variants (T258856)
	 *
	 * Titles that are absent from $titleMap, or are flagged 'missing' without
	 * being 'known', are converted to all language variants. The variants are
	 * looked up in batches of self::LINK_BATCH_SIZE, and each original title is
	 * mapped to the page data of a variant that is neither invalid nor missing.
	 * If several variants of the same original resolve, the one processed last
	 * wins.
	 *
	 * @param Env $env
	 * @param DOMDocument $doc
	 * @param array $titles map keyed by page titles
	 * @param array $titleMap map of resolved page data keyed by title
	 * @return array map of resolved variant page data keyed by original title;
	 *   each entry is the variant's page data plus a 'variantTitle' key
	 */
	private function getVariantTitles(
		Env $env,
		DOMDocument $doc,
		array $titles,
		array $titleMap
	): array {
		// Optimize for the common case where the page language has no variants
		if ( !$env->langConverterEnabled() ) {
			return [];
		}

		$origsByVariant = [];

		// Gather all nonexistent page titles to search for their variants
		foreach ( array_keys( $titles ) as $title ) {
			if (
				// T237535
				isset( $titleMap[$title] ) &&
				( empty( $titleMap[$title]['missing'] ) || !empty( $titleMap[$title]['known'] ) )
			) {
				continue;
			}

			// array_keys converts strings representing numbers to ints.
			// So, cast $title to string explicitly.
			$variantTitles = LanguageConverter::autoConvertToAllVariants( $env, $doc, (string)$title );

			foreach ( $variantTitles as $variantTitle ) {
				$origsByVariant[$variantTitle][] = $title;
			}
		}

		$variantsByOrig = [];
		// As above, array_keys may have turned numeric titles into ints;
		// normalize them back to strings before looking them up.
		$variantTitles = array_map( 'strval', array_keys( $origsByVariant ) );

		foreach ( array_chunk( $variantTitles, self::LINK_BATCH_SIZE ) as $variantChunk ) {
			$variantChunkData = $env->getDataAccess()->getPageInfo(
				$env->getPageConfig(),
				$variantChunk
			);

			// Map resolved variant titles to their corresponding originals
			foreach ( $variantChunkData as $variantTitle => $pageData ) {
				// Handle invalid titles
				// For example, a conversion might result in a title that's too long.
				if ( !empty( $pageData['invalid'] ) ) {
					continue;
				}

				// Handle non-existent variant titles
				if ( !empty( $pageData['missing'] ) && empty( $pageData['known'] ) ) {
					continue;
				}

				foreach ( $origsByVariant[$variantTitle] as $origTitle ) {
					$variantsByOrig[$origTitle] = [ 'variantTitle' => (string)$variantTitle ] + $pageData;
				}
			}
		}

		return $variantsByOrig;
	}
}