Skip to content

Commit 2aadb1d

Browse files
committed
Add ability to specify a minimum input length
Input strings shorter than the minimum length (set with the new -j option) will not be identified. Also fixes an unrelated typo in a code comment.
1 parent cd0aff5 commit 2aadb1d

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

text_cat

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@ use open qw(:std :utf8);
2525

2626
use strict;
2727
use vars
28-
qw( $opt_a $opt_d $opt_f $opt_h $opt_i $opt_l $opt_L $opt_m $opt_n $opt_s $opt_t $opt_u $opt_w );
28+
qw( $opt_a $opt_d $opt_f $opt_h $opt_i $opt_j $opt_l $opt_L $opt_m $opt_n $opt_s $opt_t $opt_u $opt_w );
2929
use Getopt::Std;
3030

3131
# OPTIONS
32-
getopts('a:d:f:hi:lL:m:nst:u:w:');
32+
getopts('a:d:f:hi:j:lL:m:nst:u:w:');
3333

3434
my $max_returned_langs = $opt_a || 10;
3535
my $model_dir = $opt_d || './LM';
@@ -43,6 +43,7 @@ my $line_by_line = $opt_s;
4343
my $max_input_ngrams = $opt_t || 500;
4444
my $max_result_ratio = $opt_u || 1.05;
4545
my $non_word_characters = $opt_w || '0-9\s()';
46+
my $min_input_length = $opt_j || 0;
4647

4748
sub help {
4849
print <<HELP
@@ -57,7 +58,7 @@ $0 -h
5758
5859
* for guessing:
5960
60-
$0 [-a Int] [-d Dir] [-f Int] [-i Int] [-l] [-t Int] [-m Int] [-u Int]
61+
$0 [-a Int] [-d Dir] [-f Int] [-i Int] [-j Int] [-l] [-t Int] [-m Int] [-u Int]
6162
6263
-a The program returns the best-scoring language together
6364
with all languages which are $max_result_ratio times worse (cf option -u).
@@ -77,6 +78,9 @@ $0 [-a Int] [-d Dir] [-f Int] [-i Int] [-l] [-t Int] [-m Int] [-u Int]
7778
7879
-i Only read the specified number of lines of the input.
7980
81+
-j Only attempt classification if the input string is at least this
82+
many characters.
83+
8084
-l Indicates that input is given as an argument on the command line,
8185
e.g. text_cat -m 2000 -l "this is english text"
8286
Cannot be used in combination with -n.
@@ -131,7 +135,7 @@ if ( !$create_model ) {
131135
push ( @languages,
132136
# lang model must exist and be readable, be compatible with the user supplied
133137
# language list, and not have been found in a previous directory; map lang code
134-
# to path as a side effect o ftracking seen codes
138+
# to path as a side effect of tracking seen codes
135139
grep { s/\.lm// && -r "$dir/$_.lm" &&
136140
( !$language_list || $user_langs{$_} ) &&
137141
( !$lang_path{$_} && ($lang_path{$_} = "$dir/$_" ) ) } readdir( DIR ) );
@@ -190,6 +194,12 @@ sub classify {
190194
my %results = ();
191195
my $maxp = $model_size;
192196

197+
if (length($input) < $min_input_length) {
198+
print
199+
"I don't know. Input is too short.\n";
200+
return;
201+
}
202+
193203
# create n-grams for input. Note that hash %unknown is not used;
194204
# it contains the actual counts which are only used under -n: creating
195205
# new language model (and even then they are not really required).

0 commit comments

Comments
 (0)