@@ -25,11 +25,11 @@ use open qw(:std :utf8);
2525
2626use strict;
2727use vars
28- qw( $opt_a $opt_d $opt_f $opt_h $opt_i $opt_l $opt_L $opt_m $opt_n $opt_s $opt_t $opt_u $opt_w ) ;
28+ qw( $opt_a $opt_d $opt_f $opt_h $opt_i $opt_j $opt_l $opt_L $opt_m $opt_n $opt_s $opt_t $opt_u $opt_w ) ;
2929use Getopt::Std;
3030
3131# OPTIONS
32- getopts(' a:d:f:hi:lL:m:nst:u:w:' );
32+ getopts(' a:d:f:hi:j:lL:m:nst:u:w:' );
3333
3434my $max_returned_langs = $opt_a || 10;
3535my $model_dir = $opt_d || ' ./LM' ;
@@ -43,6 +43,7 @@ my $line_by_line = $opt_s;
4343my $max_input_ngrams = $opt_t || 500;
4444my $max_result_ratio = $opt_u || 1.05;
4545my $non_word_characters = $opt_w || ' 0-9\s()' ;
46+ my $min_input_length = $opt_j || 0;
4647
4748sub help {
4849 print <<HELP
5758
5859* for guessing:
5960
60- $0 [-a Int] [-d Dir] [-f Int] [-i Int] [-l] [-t Int] [-m Int] [-u Int]
61+ $0 [-a Int] [-d Dir] [-f Int] [-i Int] [-j Int] [-l] [-t Int] [-m Int] [-u Int]
6162
6263 -a The program returns the best-scoring language together
6364 with all languages which are $max_result_ratio times worse (cf option -u).
@@ -77,6 +78,9 @@ $0 [-a Int] [-d Dir] [-f Int] [-i Int] [-l] [-t Int] [-m Int] [-u Int]
7778
7879 -i Only read the specified number of lines of the input.
7980
81+ -j Only attempt classification if the input string is at least this
82+ many characters.
83+
8084 -l Indicates that input is given as an argument on the command line,
8185 e.g. text_cat -m 2000 -l "this is english text"
8286 Cannot be used in combination with -n.
@@ -131,7 +135,7 @@ if ( !$create_model ) {
131135 push ( @languages ,
132136 # lang model must exist and be readable, be compatible with the user supplied
133137 # language list, and not have been found in a previous directory; map lang code
134- # to path as a side effect o ftracking seen codes
138+ # to path as a side effect of tracking seen codes
135139 grep { s /\. lm// && -r " $dir /$_ .lm" &&
136140 ( !$language_list || $user_langs {$_ } ) &&
137141 ( !$lang_path {$_ } && ($lang_path {$_ } = " $dir /$_ " ) ) } readdir ( DIR ) );
@@ -190,6 +194,12 @@ sub classify {
190194 my %results = ();
191195 my $maxp = $model_size ;
192196
197+ if (length ($input ) < $min_input_length ) {
198+ print
199+ " I don't know. Input is too short.\n " ;
200+ return ;
201+ }
202+
193203 # create n-grams for input. Note that hash %unknown is not used;
194204 # it contains the actual counts which are only used under -n: creating
195205 # new language model (and even then they are not really required).
0 commit comments