Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 126 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
| ComputeSelectiveStats | |
0.00% |
0 / 126 |
|
0.00% |
0 / 5 |
1260 | |
0.00% |
0 / 1 |
| classify | |
0.00% |
0 / 88 |
|
0.00% |
0 / 1 |
506 | |||
| pc2wt | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| filterUserAgent | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
30 | |||
| bool2str | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
| int2str | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Utils; |
| 5 | |
| 6 | use Wikimedia\Parsoid\Config\Env; |
| 7 | use Wikimedia\Parsoid\Config\PageConfig; |
| 8 | use Wikimedia\Parsoid\Core\DomPageBundle; |
| 9 | use Wikimedia\Parsoid\Core\HtmlPageBundle; |
| 10 | use Wikimedia\Parsoid\DOM\Element; |
| 11 | use Wikimedia\Parsoid\Html2Wt\DiffUtils; |
| 12 | use Wikimedia\Parsoid\Html2Wt\DOMDiff; |
| 13 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
| 14 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
| 15 | |
/**
 * This file contains code to classify opportunities for selective
 * update and collect statistics.
 */
class ComputeSelectiveStats {

	/**
	 * Compare a previous render of a page against a new render and
	 * produce a fixed set of low-cardinality string labels describing
	 * what changed between them.
	 *
	 * If either $oldPage or $oldPb is null the default labels are
	 * returned unchanged (type 'missing-prev', everything else 'unknown').
	 *
	 * Otherwise the labels are:
	 *  - type: 'no-op' (same wikitext, empty DOM diff), 'template-update'
	 *    (same wikitext, non-empty DOM diff) or 'page-update' (wikitext
	 *    differs)
	 *  - same_wt: 'true' or 'false', byte-for-byte wikitext equality
	 *  - rev_diff: '0', '1', 'minus1', or 'unknown' if the revisions are
	 *    not identical or directly adjacent
	 *  - changed_sections / changed_template_sites / changed_template_names:
	 *    '0', '1' or '2plus'
	 *
	 * @param Env $env Passed to DOMDiff and to the traversal state
	 * @param ?PageConfig $oldPage Page config of the previous render, if any
	 * @param ?HtmlPageBundle $oldPb Page bundle of the previous render, if any
	 * @param PageConfig $newPage Page config of the new render
	 * @param HtmlPageBundle $newPb Page bundle of the new render
	 * @phpcs:ignore Generic.Files.LineLength.TooLong
	 * @return array{type: string, same_wt: string, rev_diff: string, changed_sections: string, changed_template_sites: string, changed_template_names: string}
	 */
	public static function classify(
		Env $env,
		?PageConfig $oldPage, ?HtmlPageBundle $oldPb,
		PageConfig $newPage, HtmlPageBundle $newPb
	): array {
		// Default labels (ensure keys are consistent & in consistent order).
		// Each label key should be a valid label name accepted by StatsLib,
		// i.e. an alphanumeric string that does not include dashes (T394053).
		$labels = [
			'type' => 'missing-prev',
			'same_wt' => 'unknown',
			'rev_diff' => 'unknown',
			'changed_sections' => 'unknown',
			'changed_template_sites' => 'unknown',
			'changed_template_names' => 'unknown',
		];
		if ( $oldPage === null || $oldPb === null ) {
			return $labels;
		}
		$oldWt = self::pc2wt( $oldPage );
		$newWt = self::pc2wt( $newPage );

		// Compare wikitext in both revisions.
		// This must be a strict comparison: with loose `==` PHP compares
		// numeric strings numerically, so wikitext "1e3" would be reported
		// as identical to wikitext "1000". The same result is reused below
		// so that 'same_wt' and 'type' can never disagree.
		$sameWt = ( $oldWt === $newWt );
		$labels['same_wt'] = self::bool2str( $sameWt );

		// Compare revision IDs
		$oldRev = $oldPage->getRevisionId();
		$newRev = $newPage->getRevisionId();
		if ( $oldRev === $newRev ) {
			// same revision (template update, most likely)
			$labels['rev_diff'] = '0';
		} elseif ( $oldRev === $newPage->getParentRevisionId() ) {
			// "normal edit": new revision is the one after old revision
			$labels['rev_diff'] = '1';
		} elseif ( $newRev === $oldPage->getParentRevisionId() ) {
			// new revision is the one *before* old revision
			// This is probably a render triggered from RevisionOutputCache
			// of the previous revision where the "oldRev" is coming from
			// the parser cache and is thus the latest. This may happen
			// during races, vandalism patrol, HTML diffing, etc.
			$labels['rev_diff'] = 'minus1';
		}

		// Parse to DOM and diff
		$oldDoc = DomPageBundle::fromHtmlPageBundle( $oldPb )->toDom();
		$newDoc = DomPageBundle::fromHtmlPageBundle( $newPb )->toDom();
		$dd = new DOMDiff( $env );
		// Don't skip over template content!
		$dd->skipEncapsulatedContent = false;
		// Ignore differences in the data-parsoid properties 'tmp', 'tsr',
		// 'dsr', 'extTagOffsets' and 'extLinkContentOffsets' by comparing
		// clones from which those properties have been removed.
		$cleanDP = static function ( DataParsoid $dp ): DataParsoid {
			$dp = clone $dp;
			foreach ( [ 'tmp', 'tsr', 'dsr', 'extTagOffsets', 'extLinkContentOffsets' ] as $prop ) {
				unset( $dp->$prop );
			}
			return $dp;
		};
		$dd->specializedAttribHandlers['data-parsoid'] = static function (
			Element $nA, DataParsoid $vA, Element $nB, DataParsoid $vB
		) use ( $cleanDP ): bool {
			// This is deliberately a not-strict equality comparison between
			// two DataParsoid objects.
			// @phan-suppress-next-line PhanPluginComparisonObjectEqualityNotStrict
			return $cleanDP( $vA ) == $cleanDP( $vB );
		};
		// Ignore differences in 'id' attributes, since these are a side-effect
		// of data-parsoid/page bundle encapsulation.
		$dd->specializedAttribHandlers['id'] = static function (
			Element $nA, string $vA, Element $nB, string $vB
		): bool {
			// XXX we can't really tell synthetic ID attributes created by
			// DOMDataUtils::storeInPageBundle() from "real" ID attributes
			// in user wikitext. Hackishly ignore differences in any ID
			// attributes that begin with 'mw' even though technically you
			// could have a <span id='mw-something'> in wikitext, and change
			// that to <span id='mw-different-thing'> and with this attribute
			// handler DOM diff wouldn't flag the change. In theory we should
			// be using shadow attributes to record when an id was synthetic.
			if ( str_starts_with( $vA, 'mw' ) && str_starts_with( $vB, 'mw' ) ) {
				return true; // equal enough
			}
			return $vA === $vB;
		};
		[ 'isEmpty' => $emptyDiff ] = $dd->diff(
			DOMCompat::getBody( $oldDoc ),
			DOMCompat::getBody( $newDoc )
		);
		if ( $sameWt ) {
			// old and new wikitext identical. is html also identical?
			$labels['type'] = $emptyDiff ? 'no-op' : 'template-update';
		} else {
			$labels['type'] = 'page-update';
		}

		// Use a DOMTraverser to count how many sections and templates were
		// modified. (Skip attribute embedded HTML for now.)
		$dt = new DOMTraverser( true );
		$sectionsModified = 0;
		$dt->addHandler( 'section', static function ( Element $el ) use ( &$sectionsModified ) {
			if ( WTUtils::isParsoidSectionTag( $el ) && !DiffUtils::subtreeUnchanged( $el ) ) {
				$sectionsModified++;
			}
			return true;
		} );
		$templatesModified = 0;
		// Used as a set: keys are template hrefs (or 'unknown'), values are
		// always true, so count() yields the number of distinct names.
		$namedTemplates = [];
		$dt->addHandler( null, static function ( $el, $state ) use ( &$templatesModified, &$namedTemplates ) {
			if ( !( $el instanceof Element ) ) {
				return true;
			}
			if (
				$el === ( $state->tplInfo->first ?? null ) &&
				DOMUtils::hasTypeOf( $el, 'mw:Transclusion' )
			) {
				$changed = false;
				$about = DOMCompat::getAttribute( $el, 'about' );
				foreach ( WTUtils::getAboutSiblings( $el, $about ) as $sib ) {
					// Note that we might miss a change here in a sibling
					// which is fosterable IEW, since that's !Element.
					if (
						$sib instanceof Element &&
						!DiffUtils::subtreeUnchanged( $sib )
					) {
						$changed = true;
						break;
					}
				}

				// Compute the number of templates modified
				if ( $changed ) {
					$templatesModified++;
					$dataMw = DOMDataUtils::getDataMw( $el );
					// Name the site after the first TemplateInfo part
					// which has a non-null href.
					$name = null;
					foreach ( $dataMw->parts ?? [] as $part ) {
						if ( $part instanceof TemplateInfo ) {
							$name ??= $part->href;
						}
					}
					$namedTemplates[$name ?? 'unknown'] = true;
				}
				// Don't recurse into templates, just tabulate top-level
				$state->tplInfo->clear = true;
				return $state->tplInfo->last->nextSibling;
			}
			return true;
		} );
		// do the traversal
		$dt->traverse( null, DOMCompat::getBody( $newDoc ), new DTState( $env ) );

		// report changed sections as '0', '1', or '2plus'
		$labels['changed_sections'] = self::int2str( $sectionsModified, 2 );
		// report changed templates as '0', '1', or '2plus'
		$labels['changed_template_sites'] = self::int2str( $templatesModified, 2 );
		// report the count of the *names* of the templates that were updated.
		$labels['changed_template_names'] = self::int2str( count( $namedTemplates ), 2 );

		// TODO: sum up the time spent on modified (vs unmodified) templates

		return $labels;
	}

	// ----------- Helper functions ---------------

	/**
	 * Convert a PageConfig to a wikitext string.
	 *
	 * @param PageConfig $pc
	 * @return string The 'main' content of the page's revision content
	 */
	private static function pc2wt( PageConfig $pc ): string {
		return $pc->getRevisionContent()->getContent( 'main' );
	}

	// See https://www.mediawiki.org/wiki/Manual:Stats#Cardinality

	/**
	 * Restrict the cardinality of user agent labels.
	 *
	 * The user agent is normalized (each run of non-word characters becomes
	 * a single '_', leading/trailing '_' are trimmed) and then mapped to
	 * one of a fixed set of strings.
	 *
	 * @param ?string $userAgent Raw user agent string, if known
	 * @return string 'unknown' if $userAgent is null; the normalized agent
	 *  if it is an exact match for a known agent; otherwise the first
	 *  known prefix it starts with; otherwise 'other'.
	 */
	public static function filterUserAgent( ?string $userAgent ): string {
		static $acceptableAgents = [
			'ChangePropagation_JobQueue_WMF' => true,
			'ChangePropagation_WMF' => true,
			'Mobileapps_WMF' => true,
			'RESTBase_WMF' => true,
			'C_WikiAPI' => true,
			'Java_7_0_for_MediaWikiAPI' => true,
		];
		// Order matters: the first matching prefix wins, so more specific
		// prefixes must precede more general ones.
		static $agentPrefixes = [
			'MediaWiki_API',
			'MediaWiki_Bot',
			'Mozilla_4_0',
			'Mozilla_5_0',
			'Mozilla',
			'REST_API_Crawler_Google',
			'IABot',
			'Rust_mediawiki_API',
			'ChangePropagation', // fallback
		];
		if ( $userAgent === null ) {
			return 'unknown';
		}
		// Replace non-alphanumeric characters, the same way that core does
		// See mediawiki-core:includes/libs/Stats/StatsUtils::normalizeString()
		$normalized = preg_replace( '/\W+/', '_', $userAgent );
		if ( $normalized === null ) {
			// preg_replace() returns null on a PCRE error; passing that to
			// trim() would raise a TypeError under strict_types.
			return 'other';
		}
		$normalized = trim( $normalized, '_' );
		if ( $acceptableAgents[$normalized] ?? false ) {
			return $normalized;
		}
		foreach ( $agentPrefixes as $prefix ) {
			if ( str_starts_with( $normalized, $prefix ) ) {
				return $prefix;
			}
		}
		return 'other';
	}

	/**
	 * Convert a boolean to a string for labelling purposes.
	 *
	 * @param ?bool $val
	 * @return string 'true', 'false', or 'unknown' when $val is null
	 * @phan-return 'false'|'true'|'unknown'
	 */
	private static function bool2str( ?bool $val ): string {
		if ( $val === null ) {
			return 'unknown';
		}
		return $val ? 'true' : 'false';
	}

	/**
	 * Convert an integer to a string for labelling purposes,
	 * restricting its cardinality.
	 *
	 * @param ?int $val Value to label
	 * @param ?int $limit If non-null, every value greater than or equal
	 *  to this is reported as "{$limit}plus"
	 * @return string 'unknown' when $val is null, "{$limit}plus" when the
	 *  limit applies, otherwise the decimal representation of $val
	 */
	private static function int2str( ?int $val, ?int $limit = null ): string {
		if ( $val === null ) {
			return 'unknown';
		}
		if ( $limit !== null && $val >= $limit ) {
			return "{$limit}plus";
		}
		return (string)$val;
	}
}