Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 33 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
EditDistanceStringComparator | |
0.00% |
0 / 33 |
|
0.00% |
0 / 3 |
240 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getSimilarity | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
levenshtein | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
132 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace MediaWiki\Extension\Translate\Utilities\StringComparators; |
5 | |
6 | /** |
7 | * Smart string comparator that uses simple string comparison, and then |
8 | * the levenshtein algorithm to compare two strings. |
9 | * @author Abijeet Patro |
10 | * @since 2023.11 |
11 | * @license GPL-2.0-or-later |
12 | */ |
13 | class EditDistanceStringComparator implements StringComparator { |
14 | private SimpleStringComparator $simpleStringComparator; |
15 | |
16 | public function __construct() { |
17 | $this->simpleStringComparator = new SimpleStringComparator(); |
18 | } |
19 | |
20 | public function getSimilarity( $a, $b ) { |
21 | $similarity = $this->simpleStringComparator->getSimilarity( $a, $b ); |
22 | if ( $similarity > 0.9 ) { |
23 | return $similarity; |
24 | } |
25 | |
26 | $similarity = $this->levenshtein( $a, $b, mb_strlen( $a ), mb_strlen( $b ) ); |
27 | if ( $similarity === -1 ) { |
28 | return 0; |
29 | } |
30 | |
31 | // See: https://stackoverflow.com/a/59585447 |
32 | $maxLength = max( strlen( $a ), strlen( $b ) ); |
33 | return ( $maxLength - $similarity ) / $maxLength; |
34 | } |
35 | |
36 | /** |
37 | * PHP implementation of Levenshtein edit distance algorithm. Uses the native PHP implementation |
38 | * when possible for speed. The native levenshtein is limited to 255 bytes. |
39 | */ |
40 | public function levenshtein( string $str1, string $str2, int $length1, int $length2 ): int { |
41 | if ( $length1 === 0 ) { |
42 | return $length2; |
43 | } |
44 | |
45 | if ( $length2 === 0 ) { |
46 | return $length1; |
47 | } |
48 | |
49 | if ( $str1 === $str2 ) { |
50 | return 0; |
51 | } |
52 | |
53 | $byteLength1 = strlen( $str1 ); |
54 | $byteLength2 = strlen( $str2 ); |
55 | if ( $byteLength1 === $length1 && $byteLength1 <= 255 |
56 | && $byteLength2 === $length2 && $byteLength2 <= 255 |
57 | ) { |
58 | return levenshtein( $str1, $str2 ); |
59 | } |
60 | |
61 | $prevRow = range( 0, $length2 ); |
62 | for ( $i = 0; $i < $length1; $i++ ) { |
63 | $currentRow = []; |
64 | $currentRow[0] = $i + 1; |
65 | $c1 = mb_substr( $str1, $i, 1 ); |
66 | for ( $j = 0; $j < $length2; $j++ ) { |
67 | $c2 = mb_substr( $str2, $j, 1 ); |
68 | $insertions = $prevRow[$j + 1] + 1; |
69 | $deletions = $currentRow[$j] + 1; |
70 | $substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 ); |
71 | $currentRow[] = min( $insertions, $deletions, $substitutions ); |
72 | } |
73 | $prevRow = $currentRow; |
74 | } |
75 | |
76 | return $prevRow[$length2]; |
77 | } |
78 | } |