Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
41.82% covered (danger)
41.82%
46 / 110
0.00% covered (danger)
0.00%
0 / 2
CRAP
0.00% covered (danger)
0.00%
0 / 1
AddRedLinks
41.82% covered (danger)
41.82%
46 / 110
0.00% covered (danger)
0.00%
0 / 2
291.25
0.00% covered (danger)
0.00%
0 / 1
 run
54.12% covered (warning)
54.12%
46 / 85
0.00% covered (danger)
0.00%
0 / 1
74.10
 getVariantTitles
0.00% covered (danger)
0.00%
0 / 25
0.00% covered (danger)
0.00%
0 / 1
182
1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors;
5
6use DOMDocument;
7use Wikimedia\Parsoid\Config\Env;
8use Wikimedia\Parsoid\DOM\DocumentFragment;
9use Wikimedia\Parsoid\DOM\Element;
10use Wikimedia\Parsoid\DOM\Node;
11use Wikimedia\Parsoid\Language\LanguageConverter;
12use Wikimedia\Parsoid\Utils\DOMCompat;
13use Wikimedia\Parsoid\Utils\DOMDataUtils;
14use Wikimedia\Parsoid\Utils\PHPUtils;
15use Wikimedia\Parsoid\Utils\UrlUtils;
16use Wikimedia\Parsoid\Utils\WTUtils;
17use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor;
18
class AddRedLinks implements Wt2HtmlDOMProcessor {

    /**
     * Batch size to use for fetching page data to avoid exceeding LinkCache::MAX_SIZE
     */
    private const LINK_BATCH_SIZE = 1000;

    /**
     * Add red links to a document.
     *
     * Collects every `a[rel~="mw:WikiLink"]` element under $root and processes
     * them in batches of self::LINK_BATCH_SIZE. For each batch, page info is
     * fetched for all distinct `title` attribute values and each link is then
     * annotated:
     *  - links whose target is missing (and not otherwise known) get the `new`
     *    class, a `red-link-title` i18n title and `action=edit&redlink=1`
     *    appended to the href query;
     *  - links to the current page get `mw-selflink selflink` (or
     *    `mw-selflink-fragment` when the href carries a fragment) and lose
     *    their `title` attribute;
     *  - links to redirects get `mw-redirect`;
     *  - any `linkclasses` reported in the page data are added as classes;
     *  - links whose title only exists as a language variant are re-pointed
     *    at that variant (T258856).
     *
     * @inheritDoc
     */
    public function run(
        Env $env, Node $root, array $options = [], bool $atTopLevel = false
    ): void {
        '@phan-var Element|DocumentFragment $root';  // @var Element|DocumentFragment $root
        $allLinks = PHPUtils::iterable_to_array(
            DOMCompat::querySelectorAll( $root, 'a[rel~="mw:WikiLink"]' )
        );

        // Split up processing into chunks of LINK_BATCH_SIZE so that we don't
        // exceed LinkCache::MAX_SIZE
        $chunks = array_chunk( $allLinks, self::LINK_BATCH_SIZE );
        foreach ( $chunks as $links ) {
            // Use the titles as array keys to de-duplicate them.
            $titles = [];
            foreach ( $links as $a ) {
                $t = DOMCompat::getAttribute( $a, 'title' );
                if ( $t !== null ) {
                    $titles[$t] = true;
                }
            }

            if ( !$titles ) {
                // Nothing to look up in this chunk. Only skip this chunk:
                // later chunks may still contain links with titles, so we
                // must not abort the whole pass here.
                continue;
            }

            $start = microtime( true );
            $titleMap = $env->getDataAccess()->getPageInfo( $env->getPageConfig(), array_keys( $titles ) );
            if ( $env->profiling() ) {
                $profile = $env->getCurrentProfile();
                $profile->bumpMWTime( "RedLinks", 1000 * ( microtime( true ) - $start ), "api" );
                $profile->bumpCount( "RedLinks" );
            }

            $prefixedTitleText = $env->getContextTitle()->getPrefixedText();

            $variantMap = $this->getVariantTitles(
                $env,
                $root->ownerDocument,
                $titles,
                $titleMap
            );

            foreach ( $links as $a ) {
                $k = DOMCompat::getAttribute( $a, 'title' );
                if ( $k === null ) {
                    continue;
                }

                // Data for a resolved variant takes precedence over the data
                // fetched for the title as written.
                $variantData = $variantMap[$k] ?? null;
                $data = $variantData ?? $titleMap[$k] ?? null;

                if ( $data === null ) {
                    // Likely a consequence of T237535; can be removed once
                    // that is fixed.
                    $env->log( 'warn', 'We should have data for the title: ' . $k );
                    continue;
                }

                // Convert links pointing to a variant title (T258856)
                if ( $variantData !== null ) {
                    $variantTitle = $env->makeTitleFromURLDecodedStr(
                        $variantData['variantTitle']
                    );

                    $origHref = DOMCompat::getAttribute( $a, 'href' );
                    $origUrl = UrlUtils::parseUrl( $origHref ?? '' );

                    // Point at the variant page but keep the query string and
                    // fragment of the link as originally written.
                    $newUrl = UrlUtils::parseUrl( $env->makeLink( $variantTitle ) );
                    $newUrl['query'] = $origUrl['query'];
                    $newUrl['fragment'] = $origUrl['fragment'];

                    $variantPrefixedText = $variantTitle->getPrefixedText();
                    DOMDataUtils::addNormalizedAttribute(
                        $a, 'title', $variantPrefixedText, $k
                    );
                    // Set $k to the new title for the selflink check below.
                    // Note that getVariantTitles doesn't set $variantData for
                    // missing titles, so we won't be in this block for the
                    // red-link-title case below.
                    $k = $variantPrefixedText;

                    DOMDataUtils::addNormalizedAttribute(
                        $a,
                        'href',
                        UrlUtils::assembleUrl( $newUrl ),
                        $origHref,
                        // Ensure we preserve the real original value
                        // added during initial link parsing.
                        true
                    );
                }

                $a->removeAttribute( 'class' ); // Clear all, if we're doing a pb2pb refresh

                // Re-read the href: it may have been rewritten for a variant above.
                $href = DOMCompat::getAttribute( $a, 'href' );
                $parsedURL = UrlUtils::parseUrl( $href ?? '' );

                $queryElts = [];
                if ( isset( $parsedURL['query'] ) ) {
                    parse_str( $parsedURL['query'], $queryElts );
                }

                if (
                    !empty( $data['missing'] ) && empty( $data['known'] ) &&
                    $k !== $prefixedTitleText
                ) {
                    // Red link: target is missing and is not the current page.
                    DOMCompat::getClassList( $a )->add( 'new' );
                    WTUtils::addPageContentI18nAttribute( $a, 'title', 'red-link-title', [ $k ] );
                    $queryElts['action'] = 'edit';
                    $queryElts['redlink'] = '1';
                } else {
                    if ( $k === $prefixedTitleText ) {
                        // Self link: the target is the page being rendered.
                        if ( isset( $parsedURL['fragment'] ) ) {
                            DOMCompat::getClassList( $a )->add( 'mw-selflink-fragment' );
                        } else {
                            DOMCompat::getClassList( $a )->add( 'mw-selflink', 'selflink' );
                        }
                        $a->removeAttribute( 'title' );
                    }
                    // Clear a potential redlink, if we're doing a pb2pb refresh
                    // This is similar to what's happening in Html2Wt/RemoveRedLinks
                    // and maybe that pass should just run before this one.
                    if ( isset( $queryElts['action'] ) && $queryElts['action'] === 'edit' ) {
                        unset( $queryElts['action'] );
                    }
                    if ( isset( $queryElts['redlink'] ) && $queryElts['redlink'] === '1' ) {
                        unset( $queryElts['redlink'] );
                    }
                }

                if ( count( $queryElts ) === 0 ) {
                    // avoids the insertion of ? on empty query string
                    $parsedURL['query'] = null;
                } else {
                    $parsedURL['query'] = http_build_query( $queryElts );
                }
                $newHref = UrlUtils::assembleUrl( $parsedURL );

                $a->setAttribute( 'href', $newHref );

                if ( !empty( $data['redirect'] ) ) {
                    DOMCompat::getClassList( $a )->add( 'mw-redirect' );
                }
                foreach ( $data['linkclasses'] ?? [] as $extraClass ) {
                    DOMCompat::getClassList( $a )->add( $extraClass );
                }
            }
        }
    }

    /**
     * Attempt to resolve nonexistent link targets using their variants (T258856)
     *
     * Titles that are absent from $titleMap, or that are flagged there as
     * missing and not known, are converted to all language variants. Page
     * info is then fetched for those variant titles in batches of
     * self::LINK_BATCH_SIZE. Variants that are invalid, or missing and not
     * known, are ignored.
     *
     * If several variants of the same original title resolve, the one
     * processed last overwrites the earlier ones.
     *
     * @param Env $env
     * @param DOMDocument $doc
     * @param array $titles map keyed by page titles
     * @param array $titleMap map of resolved page data keyed by title
     * @return array map of resolved variant page data keyed by original title;
     *  each entry is the variant's page data plus a 'variantTitle' key holding
     *  the variant title. Empty when language conversion is not enabled.
     */
    private function getVariantTitles(
        Env $env,
        DOMDocument $doc,
        array $titles,
        array $titleMap
    ): array {
        // Optimize for the common case where the page language has no variants
        if ( !$env->langConverterEnabled() ) {
            return [];
        }

        // Variant title => list of original titles that convert to it.
        $origsByVariant = [];

        // Gather all nonexistent page titles to search for their variants
        foreach ( array_keys( $titles ) as $title ) {
            if (
                // T237535
                isset( $titleMap[$title] ) &&
                ( empty( $titleMap[$title]['missing'] ) || !empty( $titleMap[$title]['known'] ) )
            ) {
                // The title as written already resolves; no variant lookup needed.
                continue;
            }

            // array_keys converts strings representing numbers to ints.
            // So, cast $title to string explicitly.
            $variantTitles = LanguageConverter::autoConvertToAllVariants( $env, $doc, (string)$title );

            foreach ( $variantTitles as $variantTitle ) {
                $origsByVariant[$variantTitle][] = $title;
            }
        }

        $variantsByOrig = [];
        $variantTitles = array_keys( $origsByVariant );

        foreach ( array_chunk( $variantTitles, self::LINK_BATCH_SIZE ) as $variantChunk ) {
            $variantChunkData = $env->getDataAccess()->getPageInfo(
                $env->getPageConfig(),
                $variantChunk
            );

            // Map resolved variant titles to their corresponding originals
            foreach ( $variantChunkData as $variantTitle => $pageData ) {
                // Handle invalid titles
                // For example, a conversion might result in a title that's too long.
                if ( !empty( $pageData['invalid'] ) ) {
                    continue;
                }

                // Handle non-existent variant titles
                if ( !empty( $pageData['missing'] ) && empty( $pageData['known'] ) ) {
                    continue;
                }

                foreach ( $origsByVariant[$variantTitle] as $origTitle ) {
                    // With array union the left operand wins, so our
                    // 'variantTitle' entry is kept even if $pageData
                    // happens to carry a key of the same name.
                    $variantsByOrig[$origTitle] = [ 'variantTitle' => $variantTitle ] + $pageData;
                }
            }
        }

        return $variantsByOrig;
    }
}