Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
86.84% covered (warning)
86.84%
33 / 38
66.67% covered (warning)
66.67%
2 / 3
CRAP
0.00% covered (danger)
0.00%
0 / 1
TextCat
86.84% covered (warning)
86.84%
33 / 38
66.67% covered (warning)
66.67%
2 / 3
17.66
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 detect
86.11% covered (warning)
86.11%
31 / 36
0.00% covered (danger)
0.00%
0 / 1
15.60
 build
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\LanguageDetector;
4
5use CirrusSearch\SearchConfig;
6use MediaWiki\Logger\LoggerFactory;
7
/**
 * Try to detect language with TextCat text categorizer
 *
 * Model directories, tuning options and the candidate language list are all
 * read from the SearchConfig handed to the constructor.
 */
class TextCat implements Detector {

    /**
     * @var SearchConfig Configuration the TextCat settings are read from
     */
    private $config;

    /**
     * @param SearchConfig $config
     */
    public function __construct( SearchConfig $config ) {
        $this->config = $config;
    }

    /**
     * Detect language
     *
     * Reads the model directories from 'CirrusSearchTextcatModel', applies the
     * options found in 'CirrusSearchTextcatConfig' and classifies the text
     * against the languages listed in 'CirrusSearchTextcatLanguages'.
     *
     * @param string $text Text to detect language
     * @return string|null Preferred language, or null if none found
     */
    public function detect( $text ) {
        $dirs = $this->config->getElement( 'CirrusSearchTextcatModel' );
        if ( !$dirs ) {
            // No model configured, detection is not possible
            return null;
        }
        if ( !is_array( $dirs ) ) { // backward compatibility
            $dirs = [ $dirs ];
        }
        foreach ( $dirs as $dir ) {
            if ( !is_dir( $dir ) ) {
                // Only reported: the directory list is still handed to
                // TextCat unchanged below.
                LoggerFactory::getInstance( 'CirrusSearch' )->warning(
                    "Bad directory for TextCat model: {dir}",
                    [ "dir" => $dir ]
                );
            }
        }

        $textcat = new \TextCat( $dirs );

        // Looked up once; used both for boosting and as classification candidates
        $languages = $this->config->getElement( 'CirrusSearchTextcatLanguages' );

        $textcatConfig = $this->config->getElement( 'CirrusSearchTextcatConfig' );
        if ( $textcatConfig ) {
            $this->applyTextcatConfig( $textcat, $textcatConfig, $languages );
        }

        $detected = $textcat->classify( $text, $languages );
        if ( $detected ) {
            // For now, just return the best option
            // TODO: think what else we could do
            reset( $detected );
            return key( $detected );
        }

        return null;
    }

    /**
     * Copy the recognised tuning options onto the TextCat instance.
     *
     * Options that are not set are skipped. Every value is cast to the
     * numeric type its setter is given (int or float) before being passed on.
     *
     * @param \TextCat $textcat Categorizer to configure
     * @param mixed $textcatConfig Value of 'CirrusSearchTextcatConfig'
     * @param mixed $languages Value of 'CirrusSearchTextcatLanguages'
     */
    private function applyTextcatConfig( \TextCat $textcat, $textcatConfig, $languages ) {
        if ( isset( $textcatConfig['maxNgrams'] ) ) {
            $textcat->setMaxNgrams( intval( $textcatConfig['maxNgrams'] ) );
        }
        if ( isset( $textcatConfig['maxReturnedLanguages'] ) ) {
            $textcat->setMaxReturnedLanguages( intval( $textcatConfig['maxReturnedLanguages'] ) );
        }
        if ( isset( $textcatConfig['resultsRatio'] ) ) {
            $textcat->setResultsRatio( floatval( $textcatConfig['resultsRatio'] ) );
        }
        if ( isset( $textcatConfig['minInputLength'] ) ) {
            $textcat->setMinInputLength( intval( $textcatConfig['minInputLength'] ) );
        }
        if ( isset( $textcatConfig['maxProportion'] ) ) {
            $textcat->setMaxProportion( floatval( $textcatConfig['maxProportion'] ) );
        }
        if ( isset( $textcatConfig['langBoostScore'] ) ) {
            $textcat->setLangBoostScore( floatval( $textcatConfig['langBoostScore'] ) );
        }

        // Boost the first numBoostedLangs entries of the configured language
        // list; skipped when no languages are configured.
        if ( isset( $textcatConfig['numBoostedLangs'] ) && $languages ) {
            $textcat->setBoostedLangs(
                array_slice( $languages, 0, intval( $textcatConfig['numBoostedLangs'] ) )
            );
        }
    }

    /**
     * Create a detector instance for the given configuration.
     *
     * @param SearchConfig $config
     * @return Detector
     */
    public static function build( SearchConfig $config ) {
        return new self( $config );
    }
}