Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
86.84% |
33 / 38 |
|
66.67% |
2 / 3 |
CRAP | |
0.00% |
0 / 1 |
TextCat | |
86.84% |
33 / 38 |
|
66.67% |
2 / 3 |
17.66 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
detect | |
86.11% |
31 / 36 |
|
0.00% |
0 / 1 |
15.60 | |||
build | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\LanguageDetector; |
4 | |
5 | use CirrusSearch\SearchConfig; |
6 | use MediaWiki\Logger\LoggerFactory; |
7 | |
/**
 * Language detector backed by the TextCat text categorizer.
 *
 * Everything is driven by the SearchConfig handed to the constructor:
 *  - CirrusSearchTextcatModel: directory (or list of directories) holding the models
 *  - CirrusSearchTextcatConfig: optional tuning options forwarded to \TextCat
 *  - CirrusSearchTextcatLanguages: candidate languages passed to the classifier
 */
class TextCat implements Detector {

	/**
	 * @var SearchConfig Source of the CirrusSearchTextcat* settings
	 */
	private $config;

	/**
	 * @param SearchConfig $config
	 */
	public function __construct( SearchConfig $config ) {
		$this->config = $config;
	}

	/**
	 * Detect the language of a piece of text.
	 *
	 * Returns null right away when no model directory is configured. Entries of
	 * CirrusSearchTextcatModel that are not directories are reported through the
	 * 'CirrusSearch' logger but are still handed to \TextCat unchanged.
	 *
	 * @param string $text Text to detect language
	 * @return string|null Preferred language, or null if none found
	 */
	public function detect( $text ) {
		$dirs = $this->config->getElement( 'CirrusSearchTextcatModel' );
		if ( !$dirs ) {
			return null;
		}
		if ( !is_array( $dirs ) ) { // backward compatibility: a single directory used to be allowed
			$dirs = [ $dirs ];
		}
		foreach ( $dirs as $dir ) {
			if ( !is_dir( $dir ) ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					"Bad directory for TextCat model: {dir}",
					[ "dir" => $dir ]
				);
			}
		}

		$textcat = new \TextCat( $dirs );

		// Looked up once: used both for boosting and as the candidate list for classify().
		$candidateLangs = $this->config->getElement( 'CirrusSearchTextcatLanguages' );

		$textcatConfig = $this->config->getElement( 'CirrusSearchTextcatConfig' );
		if ( $textcatConfig ) {
			// Each option is only forwarded when explicitly configured, so that
			// \TextCat keeps its own defaults otherwise.
			if ( isset( $textcatConfig['maxNgrams'] ) ) {
				$textcat->setMaxNgrams( intval( $textcatConfig['maxNgrams'] ) );
			}
			if ( isset( $textcatConfig['maxReturnedLanguages'] ) ) {
				$textcat->setMaxReturnedLanguages( intval( $textcatConfig['maxReturnedLanguages'] ) );
			}
			if ( isset( $textcatConfig['resultsRatio'] ) ) {
				$textcat->setResultsRatio( floatval( $textcatConfig['resultsRatio'] ) );
			}
			if ( isset( $textcatConfig['minInputLength'] ) ) {
				$textcat->setMinInputLength( intval( $textcatConfig['minInputLength'] ) );
			}
			if ( isset( $textcatConfig['maxProportion'] ) ) {
				$textcat->setMaxProportion( floatval( $textcatConfig['maxProportion'] ) );
			}
			if ( isset( $textcatConfig['langBoostScore'] ) ) {
				$textcat->setLangBoostScore( floatval( $textcatConfig['langBoostScore'] ) );
			}

			if ( isset( $textcatConfig['numBoostedLangs'] ) && $candidateLangs ) {
				// Boost the first N configured languages. The count is cast like
				// the other numeric options so a string/float config value does
				// not reach array_slice() untyped.
				$textcat->setBoostedLangs( array_slice(
					$candidateLangs,
					0,
					intval( $textcatConfig['numBoostedLangs'] )
				) );
			}
		}

		$results = $textcat->classify( $text, $candidateLangs );
		if ( $results ) {
			// For now, just return the best option, i.e. the first key of the result
			// TODO: think what else we could do
			reset( $results );
			return key( $results );
		}

		return null;
	}

	/**
	 * Factory: create a detector bound to the given configuration.
	 *
	 * @param SearchConfig $config
	 * @return Detector
	 */
	public static function build( SearchConfig $config ) {
		return new self( $config );
	}
}