Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 33
0.00% covered (danger)
0.00%
0 / 3
CRAP
0.00% covered (danger)
0.00%
0 / 1
EditDistanceStringComparator
0.00% covered (danger)
0.00%
0 / 33
0.00% covered (danger)
0.00%
0 / 3
240
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getSimilarity
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
12
 levenshtein
0.00% covered (danger)
0.00%
0 / 24
0.00% covered (danger)
0.00%
0 / 1
132
1<?php
2declare( strict_types = 1 );
3
4namespace MediaWiki\Extension\Translate\Utilities\StringComparators;
5
6/**
7 * Smart string comparator that uses simple string comparison, and then
8 * the levenshtein algorithm to compare two strings.
9 * @author Abijeet Patro
10 * @since 2023.11
11 * @license GPL-2.0-or-later
12 */
13class EditDistanceStringComparator implements StringComparator {
14    private SimpleStringComparator $simpleStringComparator;
15
16    public function __construct() {
17        $this->simpleStringComparator = new SimpleStringComparator();
18    }
19
20    public function getSimilarity( $a, $b ) {
21        $similarity = $this->simpleStringComparator->getSimilarity( $a, $b );
22        if ( $similarity > 0.9 ) {
23            return $similarity;
24        }
25
26        $similarity = $this->levenshtein( $a, $b, mb_strlen( $a ), mb_strlen( $b ) );
27        if ( $similarity === -1 ) {
28            return 0;
29        }
30
31        // See: https://stackoverflow.com/a/59585447
32        $maxLength = max( strlen( $a ), strlen( $b ) );
33        return ( $maxLength - $similarity ) / $maxLength;
34    }
35
36    /**
37     * PHP implementation of Levenshtein edit distance algorithm. Uses the native PHP implementation
38     * when possible for speed. The native levenshtein is limited to 255 bytes.
39     */
40    public function levenshtein( string $str1, string $str2, int $length1, int $length2 ): int {
41        if ( $length1 === 0 ) {
42            return $length2;
43        }
44
45        if ( $length2 === 0 ) {
46            return $length1;
47        }
48
49        if ( $str1 === $str2 ) {
50            return 0;
51        }
52
53        $byteLength1 = strlen( $str1 );
54        $byteLength2 = strlen( $str2 );
55        if ( $byteLength1 === $length1 && $byteLength1 <= 255
56            && $byteLength2 === $length2 && $byteLength2 <= 255
57        ) {
58            return levenshtein( $str1, $str2 );
59        }
60
61        $prevRow = range( 0, $length2 );
62        for ( $i = 0; $i < $length1; $i++ ) {
63            $currentRow = [];
64            $currentRow[0] = $i + 1;
65            $c1 = mb_substr( $str1, $i, 1 );
66            for ( $j = 0; $j < $length2; $j++ ) {
67                $c2 = mb_substr( $str2, $j, 1 );
68                $insertions = $prevRow[$j + 1] + 1;
69                $deletions = $currentRow[$j] + 1;
70                $substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 );
71                $currentRow[] = min( $insertions, $deletions, $substitutions );
72            }
73            $prevRow = $currentRow;
74        }
75
76        return $prevRow[$length2];
77    }
78}