@@ -47,17 +47,17 @@ public function setMinFreq( $minFreq ) {
/**
 * Set up the classifier by indexing the language model files in a directory.
 *
 * Every regular file with the "lm" extension is recorded in $this->langFiles,
 * keyed by its base name without the ".lm" suffix and mapped to its path name.
 *
 * @param string $dir Directory to scan; when empty, the "LM" directory
 *  located next to this source file is used.
 */
public function __construct( $dir = null ) {
	// Fall back to the bundled model directory when no directory is given.
	$modelDir = empty( $dir ) ? __DIR__ . "/LM" : $dir;
	$this->dir = $modelDir;

	foreach ( new DirectoryIterator( $modelDir ) as $entry ) {
		// Only plain files carrying the "lm" extension are language models.
		if ( !$entry->isFile() || $entry->getExtension() != "lm" ) {
			continue;
		}
		$language = $entry->getBasename( ".lm" );
		$this->langFiles[$language] = $entry->getPathname();
	}
}
@@ -68,43 +68,45 @@ public function __construct($dir = null) {
6868 * @param int $maxNgrams How many ngrams to use.
6969 * @return int[]
7070 */
public function createLM( $text, $maxNgrams ) {
	// Occurrence count of every 1- to 5-character n-gram in $text,
	// keyed by the n-gram itself.
	$ngram = array();

	// Split the text into words on runs of the configured separator characters.
	$words = preg_split( "/[{$this->wordSeparator}]+/u", $text );
	if ( $words === false ) {
		// preg_split() reports failure (e.g. malformed UTF-8) with false;
		// treat that as "no words" instead of iterating over a non-array.
		$words = array();
	}

	foreach ( $words as $word ) {
		// Compare against '' explicitly: empty( "0" ) is true in PHP and
		// would silently discard the word "0".
		if ( $word === '' ) {
			continue;
		}
		// Wrap the word in "_" so n-grams touching either end of the word
		// contain a boundary marker.
		$word = "_" . $word . "_";
		$len = mb_strlen( $word, "UTF-8" );
		for ( $i = 0; $i < $len; $i++ ) {
			// Up to 5 characters, fewer when less than 5 remain in the word.
			$longest = min( 5, $len - $i );
			for ( $n = 1; $n <= $longest; $n++ ) {
				$gram = mb_substr( $word, $i, $n, "UTF-8" );
				if ( isset( $ngram[$gram] ) ) {
					$ngram[$gram]++;
				} else {
					$ngram[$gram] = 1;
				}
			}
		}
	}

	// Optionally drop n-grams that do not occur more than minFreq times.
	if ( $this->minFreq ) {
		$min = $this->minFreq;
		$ngram = array_filter( $ngram, function ( $v ) use ( $min ) {
			return $v > $min;
		} );
	}

	// Order by descending count; equal counts are ordered by n-gram text.
	// Keys are cast to string because PHP stores numeric-looking keys as integers.
	uksort( $ngram, function ( $k1, $k2 ) use ( $ngram ) {
		if ( $ngram[$k1] == $ngram[$k2] ) {
			return strcmp( (string)$k1, (string)$k2 );
		}
		return $ngram[$k2] - $ngram[$k1];
	} );

	// Keep only the $maxNgrams most frequent entries. array_slice() with
	// preserve_keys is used because array_splice() renumbers integer keys,
	// which would replace numeric n-grams by their position.
	if ( count( $ngram ) > $maxNgrams ) {
		$ngram = array_slice( $ngram, 0, (int)$maxNgrams, true );
	}
	return $ngram;
}
@@ -114,9 +116,9 @@ public function createLM($text, $maxNgrams) {
114116 * @param string $langFile
115117 * @return int[] Language file data
116118 */
public function loadLanguageFile( $langFile ) {
	// The language file is PHP source that is expected to define $ranks in
	// this scope. NOTE(review): include executes the file, so $langFile
	// must only ever point at trusted model files.
	include $langFile;

	// A missing, unreadable or malformed file leaves $ranks undefined;
	// report that as an empty model rather than operating on a non-array.
	if ( !isset( $ranks ) || !is_array( $ranks ) ) {
		return array();
	}

	// Keep at most maxNgrams entries. array_slice() with preserve_keys is
	// used because array_splice() renumbers integer keys, which would
	// replace numeric n-grams by their position.
	return array_slice( $ranks, 0, (int)$this->maxNgrams, true );
}
122124
@@ -125,15 +127,17 @@ public function loadLanguageFile($langFile) {
125127 * @param int[] $ngrams
126128 * @param string $outfile Output filename
127129 */
public function writeLanguageFile( $ngrams, $outfile ) {
	// write original array as "$ngrams"
	$code = '<?php $ngrams = ' . var_export( $ngrams, true ) . ";\n";

	// write reduced array as "$ranks": same keys in the same order, each
	// value replaced by the entry's 1-based position
	$ranks = array();
	$rank = 1;
	foreach ( $ngrams as $gram => $unused ) {
		$ranks[$gram] = $rank++;
	}
	$code .= '$ranks = ' . var_export( $ranks, true ) . ";\n";

	// Write everything in one call and fail loudly: the previous
	// fopen()/fwrite()/fclose() sequence never checked the handle, so an
	// unwritable path went on to write to an invalid handle.
	if ( file_put_contents( $outfile, $code ) === false ) {
		throw new \RuntimeException( "Unable to write language file: $outfile" );
	}
}
138142
139143 /**
@@ -143,29 +147,29 @@ public function writeLanguageFile($ngrams, $outfile) {
143147 * @return int[] Array with keys of language names and values of score.
144148 * Sorted by ascending score, with first result being the best.
145149 */
public function classify( $text, $candidates = null ) {
	// N-grams of the input text; the array index of each n-gram is its
	// position in the order returned by createLM().
	$inputModel = $this->createLM( $text, $this->maxNgrams );
	$inputgrams = array_keys( $inputModel );

	// When a candidate list is given, flip it so membership is a key lookup.
	$allowed = $candidates ? array_flip( $candidates ) : null;

	$scores = array();
	foreach ( $this->langFiles as $language => $langFile ) {
		// Skip languages outside the requested candidate set.
		if ( $allowed && !isset( $allowed[$language] ) ) {
			continue;
		}

		$ranks = $this->loadLanguageFile( $langFile );
		$distance = 0;
		foreach ( $inputgrams as $position => $gram ) {
			// An n-gram absent from the language model costs maxNgrams;
			// otherwise the cost is how far its rank is from its input position.
			$distance += empty( $ranks[$gram] )
				? $this->maxNgrams
				: abs( $ranks[$gram] - $position );
		}
		$scores[$language] = $distance;
	}

	// Ascending order: the smallest score comes first.
	asort( $scores );
	return $scores;
}
171175}
0 commit comments