Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 119 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
ComputeSelectiveStats | |
0.00% |
0 / 119 |
|
0.00% |
0 / 5 |
1260 | |
0.00% |
0 / 1 |
classify | |
0.00% |
0 / 84 |
|
0.00% |
0 / 1 |
506 | |||
pc2wt | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
filterUserAgent | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
30 | |||
bool2str | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
int2str | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\Config\PageConfig; |
8 | use Wikimedia\Parsoid\Core\DomPageBundle; |
9 | use Wikimedia\Parsoid\Core\PageBundle; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\Html2Wt\DiffUtils; |
12 | use Wikimedia\Parsoid\Html2Wt\DOMDiff; |
13 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
14 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
15 | |
/**
 * This file contains code to classify opportunities for selective
 * update and collect statistics.
 *
 * All results are reported as short strings drawn from a small, fixed
 * vocabulary so that they can be used as labels; see
 * https://www.mediawiki.org/wiki/Manual:Stats#Cardinality
 */
class ComputeSelectiveStats {

	/**
	 * Compare a previous rendering of a page with a new rendering and
	 * describe how they differ.
	 *
	 * The returned array always has the same keys in the same order:
	 *  - 'type': 'missing-prev' when there is no previous rendering to
	 *    compare against, 'no-op' when wikitext and HTML are both
	 *    unchanged, 'template-update' when the wikitext is unchanged but
	 *    the HTML differs, and 'page-update' when the wikitext differs.
	 *  - 'same-wt': 'true' or 'false', whether the main-slot wikitext of
	 *    both page configs is identical.
	 *  - 'rev-diff': '0' (same revision), '1' (old revision is the parent
	 *    of the new one), 'minus1' (new revision is the parent of the old
	 *    one), or 'unknown' for any other relationship.
	 *  - 'changed-sections', 'changed-template-sites',
	 *    'changed-template-names': '0', '1' or '2plus'.
	 * Every label other than 'type' is 'unknown' when there is no previous
	 * rendering.
	 *
	 * @param Env $env
	 * @param ?PageConfig $oldPage Page config of the previous rendering, if any
	 * @param ?PageBundle $oldPb Page bundle of the previous rendering, if any
	 * @param PageConfig $newPage Page config of the new rendering
	 * @param PageBundle $newPb Page bundle of the new rendering
	 * @return array<string,string>
	 */
	public static function classify(
		Env $env,
		?PageConfig $oldPage, ?PageBundle $oldPb,
		PageConfig $newPage, PageBundle $newPb
	): array {
		// Default labels (ensure keys are consistent & in consistent order)
		$labels = [
			'type' => 'missing-prev',
			'same-wt' => 'unknown',
			'rev-diff' => 'unknown',
			'changed-sections' => 'unknown',
			'changed-template-sites' => 'unknown',
			'changed-template-names' => 'unknown',
		];
		if ( $oldPage === null || $oldPb === null ) {
			return $labels;
		}
		$oldWt = self::pc2wt( $oldPage );
		$newWt = self::pc2wt( $newPage );

		// Compare wikitext in both revisions.
		// This has to be a strict comparison: `==` compares two numeric
		// strings by value (so '1e3' == '1000'), which would report
		// different wikitext as identical and contradict the 'type'
		// label computed below.
		$sameWt = ( $oldWt === $newWt );
		$labels['same-wt'] = self::bool2str( $sameWt );

		// Compare revision IDs
		$oldRev = $oldPage->getRevisionId();
		$newRev = $newPage->getRevisionId();
		if ( $oldRev === $newRev ) {
			// same revision (template update, most likely)
			$labels['rev-diff'] = '0';
		} elseif ( $oldRev === $newPage->getParentRevisionId() ) {
			// "normal edit": new revision is the one after old revision
			$labels['rev-diff'] = '1';
		} elseif ( $newRev === $oldPage->getParentRevisionId() ) {
			// new revision is the one *before* old revision
			// This is probably a render triggered from RevisionOutputCache
			// of the previous revision where the "oldRev" is coming from
			// the parser cache and is thus the latest. This may happen
			// during races, vandalism patrol, HTML diffing, etc.
			$labels['rev-diff'] = 'minus1';
		}

		// Parse to DOM and diff
		$oldDoc = DomPageBundle::fromPageBundle( $oldPb )->toDom();
		$newDoc = DomPageBundle::fromPageBundle( $newPb )->toDom();
		$dd = new DOMDiff( $env );
		// Don't skip over template content!
		$dd->skipEncapsulatedContent = false;
		// Ignore differences in the data-parsoid properties listed below,
		// by comparing copies from which those properties were removed.
		$cleanDP = static function ( DataParsoid $dp ): DataParsoid {
			$dp = clone $dp;
			foreach ( [ 'tmp', 'tsr', 'dsr', 'extTagOffsets', 'extLinkContentOffsets' ] as $prop ) {
				unset( $dp->$prop );
			}
			return $dp;
		};
		$dd->specializedAttribHandlers['data-parsoid'] = static function ( $nA, $vA, $nB, $vB ) use ( $cleanDP ) {
			// This is deliberately a not-strict equality comparison between
			// two DataParsoid objects.
			// @phan-suppress-next-line PhanPluginComparisonObjectEqualityNotStrict
			return $cleanDP( $vA ) == $cleanDP( $vB );
		};
		// Ignore differences in 'id' attributes, since these are a side-effect
		// of data-parsoid/page bundle encapsulation.
		$dd->specializedAttribHandlers['id'] = static function ( $nA, $vA, $nB, $vB ) {
			// XXX we can't really tell synthetic ID attributes created by
			// DOMDataUtils::storeInPageBundle() from "real" ID attributes
			// in user wikitext. Hackishly ignore differences in any ID
			// attributes that begin with 'mw' even though technically you
			// could have a <span id="mw-something"> in wikitext, and change
			// that to <span id="mw-different-thing"> and with this attribute
			// handler DOM diff wouldn't flag the change. In theory we should
			// be using shadow attributes to record when an id was synthetic.
			if ( str_starts_with( $vA, 'mw' ) && str_starts_with( $vB, 'mw' ) ) {
				return true; // equal enough
			}
			return $vA === $vB;
		};
		[ 'isEmpty' => $emptyDiff ] = $dd->diff(
			DOMCompat::getBody( $oldDoc ),
			DOMCompat::getBody( $newDoc )
		);
		if ( $sameWt ) {
			// old and new wikitext identical. is html also identical?
			$labels['type'] = $emptyDiff ? 'no-op' : 'template-update';
		} else {
			$labels['type'] = 'page-update';
		}

		// Use a DOMTraverser to count how many sections and templates were
		// modified. (Skip attribute embedded HTML for now.)
		$dt = new DOMTraverser( true );
		$sectionsModified = 0;
		$dt->addHandler( 'section', static function ( Element $el ) use ( &$sectionsModified ) {
			if ( WTUtils::isParsoidSectionTag( $el ) && !DiffUtils::subtreeUnchanged( $el ) ) {
				$sectionsModified++;
			}
			return true;
		} );
		$templatesModified = 0;
		// Set of template names (as array keys) with at least one changed site
		$namedTemplates = [];
		$dt->addHandler( null, static function ( $el, $state ) use ( &$templatesModified, &$namedTemplates ) {
			if ( !( $el instanceof Element ) ) {
				return true;
			}
			if (
				$el === ( $state->tplInfo->first ?? null ) &&
				DOMUtils::hasTypeOf( $el, 'mw:Transclusion' )
			) {
				$changed = false;
				$about = DOMCompat::getAttribute( $el, 'about' );
				foreach ( WTUtils::getAboutSiblings( $el, $about ) as $sib ) {
					// Note that we might miss a change here in a sibling
					// which is fosterable IEW, since that's !Element.
					if (
						$sib instanceof Element &&
						!DiffUtils::subtreeUnchanged( $sib )
					) {
						$changed = true;
						break;
					}
				}

				// Compute the number of templates modified
				if ( $changed ) {
					$templatesModified++;
					$dataMw = DOMDataUtils::getDataMw( $el );
					// Use the first non-null href among the TemplateInfo
					// parts as the name of this transclusion.
					$name = null;
					foreach ( $dataMw->parts ?? [] as $part ) {
						if ( $part instanceof TemplateInfo ) {
							$name ??= $part->href;
						}
					}
					$namedTemplates[$name ?? 'unknown'] = true;
				}
				// Don't recurse into templates, just tabulate top-level
				$state->tplInfo->clear = true;
				return $state->tplInfo->last->nextSibling;
			}
			return true;
		} );
		// do the traversal
		$dt->traverse( null, DOMCompat::getBody( $newDoc ), new DTState( $env ) );

		// report changed sections as '0', '1', or '2plus'
		$labels['changed-sections'] = self::int2str( $sectionsModified, 2 );
		// report changed templates as '0', '1', or '2plus'
		$labels['changed-template-sites'] = self::int2str( $templatesModified, 2 );
		// report the count of the *names* of the templates that were updated.
		$labels['changed-template-names'] = self::int2str( count( $namedTemplates ), 2 );

		// TODO: sum up the time spent on modified (vs unmodified) templates

		return $labels;
	}

	// ----------- Helper functions ---------------

	/**
	 * Convert a PageConfig to a wikitext string.
	 *
	 * @param PageConfig $pc
	 * @return string The 'main' content of the page config's revision content
	 */
	private static function pc2wt( PageConfig $pc ): string {
		return $pc->getRevisionContent()->getContent( 'main' );
	}

	// See https://www.mediawiki.org/wiki/Manual:Stats#Cardinality

	/**
	 * Restrict the cardinality of user agent labels.
	 *
	 * @param ?string $userAgent
	 * @return string 'unknown' for null; the agent itself when it is one of
	 *  the exactly-matched acceptable agents; otherwise the first listed
	 *  prefix the agent starts with; otherwise 'other'.
	 */
	public static function filterUserAgent( ?string $userAgent ): string {
		static $acceptableAgents = [
			'ChangePropagation_JobQueue_WMF' => true,
			'ChangePropagation_WMF' => true,
			'Mobileapps_WMF' => true,
			'RESTBase_WMF' => true,
			'C_WikiAPI' => true,
			'Java_7_0_for_MediaWikiAPI' => true,
		];
		// The first matching prefix wins, so a more specific prefix has to
		// be listed before a shorter prefix of it ('Mozilla_5_0' before
		// 'Mozilla').
		static $agentPrefixes = [
			'MediaWiki_API',
			'MediaWiki_Bot',
			'Mozilla_4_0',
			'Mozilla_5_0',
			'Mozilla',
			'REST_API_Crawler_Google',
			'IABot',
			'Rust_mediawiki_API',
		];
		if ( $userAgent === null ) {
			return 'unknown';
		}
		if ( $acceptableAgents[$userAgent] ?? false ) {
			return $userAgent;
		}
		foreach ( $agentPrefixes as $prefix ) {
			if ( str_starts_with( $userAgent, $prefix ) ) {
				return $prefix;
			}
		}
		return 'other';
	}

	/**
	 * Convert a boolean to a string for labelling purposes.
	 *
	 * @param ?bool $val
	 * @return string 'true', 'false', or 'unknown' for null
	 */
	private static function bool2str( ?bool $val ): string {
		if ( $val === true ) {
			return 'true';
		}
		if ( $val === false ) {
			return 'false';
		}
		return 'unknown';
	}

	/**
	 * Convert an integer to a string for labelling purposes,
	 * restricting its cardinality.
	 *
	 * @param ?int $val
	 * @param ?int $limit If not null, every value greater than or equal to
	 *  this limit is reported as "{$limit}plus".
	 * @return string 'unknown' for null, otherwise the (possibly capped) value
	 */
	private static function int2str( ?int $val, ?int $limit = null ): string {
		if ( $val === null ) {
			return 'unknown';
		}
		if ( $limit !== null && $val >= $limit ) {
			return "{$limit}plus";
		}
		return "$val";
	}
}