Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 93 |
|
0.00% |
0 / 4 |
CRAP | |
0.00% |
0 / 1 |
ComputeSelectiveStats | |
0.00% |
0 / 93 |
|
0.00% |
0 / 4 |
930 | |
0.00% |
0 / 1 |
classify | |
0.00% |
0 / 84 |
|
0.00% |
0 / 1 |
506 | |||
pc2wt | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
bool2str | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
int2str | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\Config\PageConfig; |
8 | use Wikimedia\Parsoid\Core\DomPageBundle; |
9 | use Wikimedia\Parsoid\Core\PageBundle; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\Html2Wt\DiffUtils; |
12 | use Wikimedia\Parsoid\Html2Wt\DOMDiff; |
13 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
14 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
15 | |
/**
 * Classifies opportunities for selective update and collects statistics.
 *
 * The single public entry point, classify(), compares a previous rendering
 * of a page with a new one and returns a fixed set of string labels
 * describing how the two differ.
 */
class ComputeSelectiveStats {

	/**
	 * data-parsoid properties which are stripped from both sides before two
	 * DataParsoid objects are compared by the DOM diff.
	 */
	private const IGNORED_DP_PROPS = [
		'tmp', 'tsr', 'dsr', 'extTagOffsets', 'extLinkContentOffsets',
	];

	/**
	 * Compare the previous and the new rendering of a page and label the
	 * kind of update which happened between them.
	 *
	 * The returned array always has the same keys in the same order:
	 *  - 'type': 'missing-prev' (no previous page/bundle was supplied),
	 *    'no-op' (same wikitext, empty DOM diff), 'template-update' (same
	 *    wikitext, non-empty DOM diff) or 'page-update' (wikitext differs).
	 *  - 'same-wt': 'true' or 'false'.
	 *  - 'rev-diff': '0', '1', 'minus1', or 'unknown' when the two revisions
	 *    are not the same and neither is the parent of the other.
	 *  - 'changed-sections', 'changed-template-sites',
	 *    'changed-template-names': '0', '1' or '2plus'.
	 * When 'type' is 'missing-prev' every other label is 'unknown'.
	 *
	 * @param Env $env
	 * @param ?PageConfig $oldPage Previous page, or null if unavailable
	 * @param ?PageBundle $oldPb Previous rendering, or null if unavailable
	 * @param PageConfig $newPage
	 * @param PageBundle $newPb
	 * @return array<string,string>
	 */
	public static function classify(
		Env $env,
		?PageConfig $oldPage, ?PageBundle $oldPb,
		PageConfig $newPage, PageBundle $newPb
	): array {
		// Default labels (ensure keys are consistent & in consistent order)
		$labels = [
			'type' => 'missing-prev',
			'same-wt' => 'unknown',
			'rev-diff' => 'unknown',
			'changed-sections' => 'unknown',
			'changed-template-sites' => 'unknown',
			'changed-template-names' => 'unknown',
		];
		if ( $oldPage === null || $oldPb === null ) {
			return $labels;
		}

		// Compare wikitext in both revisions. This has to be a strict
		// comparison: with `==` PHP compares two numeric strings by value,
		// so e.g. "1" and "1.0" would be reported as the same wikitext.
		// The same boolean drives both 'same-wt' and 'type' so the two
		// labels can never disagree.
		$oldWt = self::pc2wt( $oldPage );
		$newWt = self::pc2wt( $newPage );
		$sameWt = ( $oldWt === $newWt );
		$labels['same-wt'] = self::bool2str( $sameWt );

		// Compare revision IDs
		$labels['rev-diff'] = self::revDiffLabel( $oldPage, $newPage );

		// Parse to DOM and diff
		$oldDoc = DomPageBundle::fromPageBundle( $oldPb )->toDom();
		$newDoc = DomPageBundle::fromPageBundle( $newPb )->toDom();
		[ 'isEmpty' => $emptyDiff ] = self::newDomDiff( $env )->diff(
			DOMCompat::getBody( $oldDoc ),
			DOMCompat::getBody( $newDoc )
		);
		if ( $sameWt ) {
			// old and new wikitext identical. is html also identical?
			$labels['type'] = $emptyDiff ? 'no-op' : 'template-update';
		} else {
			$labels['type'] = 'page-update';
		}

		// Count modified sections and templates in the new document. This
		// runs after the diff above; DiffUtils::subtreeUnchanged() presumably
		// reads marks which the diff left on the new DOM -- TODO confirm.
		[ $sections, $templateSites, $templateNames ] =
			self::countModifications( $env, DOMCompat::getBody( $newDoc ) );

		// Each count is reported as '0', '1', or '2plus'
		$labels['changed-sections'] = self::int2str( $sections, 2 );
		$labels['changed-template-sites'] = self::int2str( $templateSites, 2 );
		// This is the count of the distinct *names* of the changed templates
		$labels['changed-template-names'] = self::int2str( $templateNames, 2 );

		// TODO: sum up the time spent on modified (vs unmodified) templates

		return $labels;
	}

	// ----------- Helper functions ---------------

	/**
	 * Describe how the revision of $newPage relates to that of $oldPage.
	 *
	 * @param PageConfig $oldPage
	 * @param PageConfig $newPage
	 * @return string '0', '1', 'minus1' or 'unknown'
	 */
	private static function revDiffLabel(
		PageConfig $oldPage, PageConfig $newPage
	): string {
		$oldRev = $oldPage->getRevisionId();
		$newRev = $newPage->getRevisionId();
		if ( $oldRev === $newRev ) {
			// same revision (template update, most likely)
			return '0';
		}
		if ( $oldRev === $newPage->getParentRevisionId() ) {
			// "normal edit": new revision is the one after old revision
			return '1';
		}
		if ( $newRev === $oldPage->getParentRevisionId() ) {
			// new revision is the one *before* old revision
			// This is probably a render triggered from RevisionOutputCache
			// of the previous revision where the "oldRev" is coming from
			// the parser cache and is thus the latest. This may happen
			// during races, vandalism patrol, HTML diffing, etc.
			return 'minus1';
		}
		return 'unknown';
	}

	/**
	 * Create a DOMDiff configured for comparing two complete renderings.
	 *
	 * Template content is not skipped, data-parsoid is compared after
	 * removing the properties listed in IGNORED_DP_PROPS, and 'id'
	 * attributes which both start with 'mw' are treated as equal.
	 *
	 * @param Env $env
	 * @return DOMDiff
	 */
	private static function newDomDiff( Env $env ): DOMDiff {
		$dd = new DOMDiff( $env );
		// Don't skip over template content!
		$dd->skipEncapsulatedContent = false;
		// Ignore differences in the data-parsoid properties listed in
		// IGNORED_DP_PROPS
		$dd->specializedAttribHandlers['data-parsoid'] = static function ( $nA, $vA, $nB, $vB ) {
			// This is deliberately a not-strict equality comparison between
			// two DataParsoid objects.
			// @phan-suppress-next-line PhanPluginComparisonObjectEqualityNotStrict
			return self::cleanDataParsoid( $vA ) == self::cleanDataParsoid( $vB );
		};
		// Ignore differences in 'id' attributes, since these are a side-effect
		// of data-parsoid/page bundle encapsulation.
		$dd->specializedAttribHandlers['id'] = static function ( $nA, $vA, $nB, $vB ) {
			// XXX we can't really tell synthetic ID attributes created by
			// DOMDataUtils::storeInPageBundle() from "real" ID attributes
			// in user wikitext. Hackishly ignore differences in any ID
			// attributes that begin with 'mw' even though technically you
			// could have a <span id='mw-something'> in wikitext, and change
			// that to <span id='mw-different-thing'> and with this attribute
			// handler DOM diff wouldn't flag the change. In theory we should
			// be using shadow attributes to record when an id was synthetic.
			if ( str_starts_with( $vA, 'mw' ) && str_starts_with( $vB, 'mw' ) ) {
				return true; // equal enough
			}
			return $vA === $vB;
		};
		return $dd;
	}

	/**
	 * Return a copy of $dp without the properties in IGNORED_DP_PROPS.
	 * The object passed in is not modified.
	 *
	 * @param DataParsoid $dp
	 * @return DataParsoid
	 */
	private static function cleanDataParsoid( DataParsoid $dp ): DataParsoid {
		$copy = clone $dp;
		foreach ( self::IGNORED_DP_PROPS as $prop ) {
			unset( $copy->$prop );
		}
		return $copy;
	}

	/**
	 * Use a DOMTraverser to count how many sections and templates were
	 * modified. (Skip attribute embedded HTML for now.)
	 *
	 * @param Env $env
	 * @param mixed $root Traversal root; the value returned by
	 *   DOMCompat::getBody() for the new document, passed on to
	 *   DOMTraverser::traverse() unchanged
	 * @return int[] A list of three counts: modified sections, modified
	 *   top-level template sites, and distinct names of modified templates
	 */
	private static function countModifications( Env $env, $root ): array {
		$dt = new DOMTraverser( true );

		$sectionsModified = 0;
		$dt->addHandler( 'section', static function ( Element $el ) use ( &$sectionsModified ) {
			if ( WTUtils::isParsoidSectionTag( $el ) && !DiffUtils::subtreeUnchanged( $el ) ) {
				$sectionsModified++;
			}
			return true;
		} );

		$templatesModified = 0;
		$namedTemplates = [];
		$dt->addHandler( null, static function ( $el, $state ) use ( &$templatesModified, &$namedTemplates ) {
			if ( !( $el instanceof Element ) ) {
				return true;
			}
			// Only act on the first node of a transclusion
			if (
				$el !== ( $state->tplInfo->first ?? null ) ||
				!DOMUtils::hasTypeOf( $el, 'mw:Transclusion' )
			) {
				return true;
			}

			$changed = false;
			$about = DOMCompat::getAttribute( $el, 'about' );
			foreach ( WTUtils::getAboutSiblings( $el, $about ) as $sib ) {
				// Note that we might miss a change here in a sibling
				// which is fosterable IEW, since that's !Element.
				if (
					$sib instanceof Element &&
					!DiffUtils::subtreeUnchanged( $sib )
				) {
					$changed = true;
					break;
				}
			}

			if ( $changed ) {
				$templatesModified++;
				// Record the transclusion under the href of its first
				// template part which has one, or under 'unknown'.
				$dataMw = DOMDataUtils::getDataMw( $el );
				$name = null;
				foreach ( $dataMw->parts ?? [] as $part ) {
					if ( $part instanceof TemplateInfo ) {
						$name ??= $part->href;
					}
				}
				$namedTemplates[$name ?? 'unknown'] = true;
			}

			// Don't recurse into templates, just tabulate top-level
			$state->tplInfo->clear = true;
			return $state->tplInfo->last->nextSibling;
		} );

		// do the traversal
		$dt->traverse( null, $root, new DTState( $env ) );

		return [ $sectionsModified, $templatesModified, count( $namedTemplates ) ];
	}

	/** Convert a PageConfig to a wikitext string. */
	private static function pc2wt( PageConfig $pc ): string {
		return $pc->getRevisionContent()->getContent( 'main' );
	}

	/**
	 * Convert a boolean to a string for labelling purposes.
	 *
	 * @param ?bool $val
	 * @return string 'true', 'false', or 'unknown' for null
	 */
	private static function bool2str( ?bool $val ): string {
		if ( $val === null ) {
			return 'unknown';
		}
		return $val ? 'true' : 'false';
	}

	/**
	 * Convert an integer to a string for labelling purposes.
	 *
	 * @param ?int $val
	 * @param ?int $limit If given, every value greater than or equal to
	 *   $limit is reported as "{$limit}plus"
	 * @return string 'unknown' for null, otherwise the (possibly capped)
	 *   decimal value
	 */
	private static function int2str( ?int $val, ?int $limit = null ): string {
		if ( $val === null ) {
			return 'unknown';
		}
		if ( $limit !== null && $val >= $limit ) {
			return "{$limit}plus";
		}
		return (string)$val;
	}
}