Skip to content

Commit 9423e7d

Browse files
committed
Add ability to boost likelihood of specific languages
Add the ability to specify a list of languages to have their scores boosted, and an amount (as a percentage) to boost them. Tidy up and fix errors in help text.
1 parent a448652 commit 9423e7d

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Updates from the original version include:
2525
* Allow specification of a minimum input length (-j); shorter strings will not be identified. Minimum length does not count non-word characters.
2626
* Allow specification of a maximum proportion of highest (i.e., worst) possible score (-p), to filter "junk" texts mostly made of unknown characters and n-grams, and to a lesser extent texts in languages that are not even similar to the models in use.
2727
* Merge n-gram count for input text and language model size to one shared value.
28+
* Allow boosting of particular languages in results (based, for example, on a priori knowledge of the likelihood of various languages being present).
2829

2930
## Classification and Model Generation
3031

text_cat

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ use Encode qw(decode);
2626

2727
use strict;
2828
use vars
29-
qw( $opt_a $opt_d $opt_f $opt_h $opt_i $opt_j $opt_l $opt_L $opt_m $opt_n $opt_p $opt_s $opt_u $opt_w );
29+
qw( $opt_a $opt_b $opt_B $opt_d $opt_f $opt_h $opt_i $opt_j $opt_l $opt_L $opt_m $opt_n $opt_p $opt_s $opt_u $opt_w );
3030
use Getopt::Std;
3131

3232
# OPTIONS
33-
getopts('a:d:f:hi:j:l:L:m:np:su:w:');
33+
getopts('a:b:B:d:f:hi:j:l:L:m:np:su:w:');
3434

3535
my $max_returned_langs = $opt_a || 10;
3636
my $model_dir = $opt_d || './LM';
@@ -45,6 +45,9 @@ my $max_result_ratio = $opt_u || 1.05;
4545
my $non_word_characters = decode( "utf-8", $opt_w ) || '0-9\s()';
4646
my $min_input_length = $opt_j || 0;
4747
my $max_proportion = $opt_p || 1.00;
48+
my $boosted_langs = $opt_B || '';
49+
my $boost_score = $opt_b || 0;
50+
4851

4952
sub help {
5053
print <<HELP
@@ -59,30 +62,36 @@ $0 -h
5962
6063
* for guessing:
6164
62-
$0 [-d Dir] [-c Lang] [-a Int] [-u Float] [-l Text]
65+
$0 [-d Dir] [-L Lang] [-a Int] [-u Float] [-l Text]
6366
[-f Int] [-j Int] [-m Int] [-p Float] [-w String]
67+
[-b Float -B Lang] [-s] [-i]
68+
69+
-a The program returns the best-scoring language together with
70+
all languages which are $max_result_ratio times worse (cf
71+
option -u). If the number of languages to be printed is larger
72+
than the value of this option (default: $max_returned_langs)
73+
then no language is returned, but instead a message that the
74+
input is of an unknown language is printed. Default: 10
6475
65-
-a The program returns the best-scoring language together
66-
with all languages which are $max_result_ratio times worse (cf option -u).
67-
If the number of languages to be printed is larger than the value
68-
of this option (default: $max_returned_langs) then no language is returned, but
69-
instead a message that the input is of an unknown language is
70-
printed. Default: $max_returned_langs.
76+
-b Boost to apply to languages specified by -B. Default: 0
7177
72-
-d Indicates in which directories the language models are
73-
located (files ending in .lm). Multiple directories can be separated
74-
by a comma, and will be used in order. Default: $model_dir.
78+
-B Comma-separated list of languages to boost by amount specified
79+
by -b. Default: none
80+
81+
-d Indicates in which directories the language models are located
82+
(files ending in .lm). Multiple directories can be separated by
83+
a comma, and will be used in order. Default: ./LM
7584
7685
-f Before sorting is performed the n-grams which occur this number
7786
of times or fewer are removed. This can be used to speed up
7887
the program for longer inputs. For short inputs you should use
79-
-f 0. Default: $ngram_min_freq.
88+
-f 0. Default: 0
8089
8190
-i Only read the specified number of lines of the input.
8291
8392
-j Only attempt classification if the input string is at least this
8493
many characters (not counting non-word characters, see -w).
85-
Default: 0.
94+
Default: 0
8695
8796
-l Indicates that input is given as an argument on the command line,
8897
e.g. text_cat -m 2000 -l "this is english text"
@@ -100,13 +109,13 @@ $0 [-d Dir] [-c Lang] [-a Int] [-u Float] [-l Text]
100109
101110
-m Indicates the maximum number of n-grams from each language model
102111
that should be used. Use 0 for "all". Typical value: > 500 for
103-
longer texts, > 2000 for very short texts. Default: $model_size.
112+
longer texts, > 2000 for very short texts. Default: 500
104113
105114
-u Determines how much worse result must be in order not to be
106-
mentioned as an alternative. Typical value: 1.05 or 1.1.
107-
Default: $max_result_ratio.
115+
mentioned as an alternative. Typical value: 1.05 or 1.10.
116+
Default: 1.05
108117
109-
-w Regex for non-word characters. Default: '$non_word_characters'.
118+
-w Regex for non-word characters. Default: '0-9\\s()'
110119
111120
112121
* for creating new language model, based on text read from standard input:
@@ -123,6 +132,7 @@ if ($opt_h) { help(); exit 0; }
123132
my @languages = ();
124133
my @model_dirs = split( /[, ]+/, $model_dir );
125134
my %user_langs = map {$_ => 1} split( /[, ]+/, $language_list );
135+
my %boost = map {$_ => $boost_score} split( /[, ]+/, $boosted_langs );
126136
my %all_ngram = ();
127137

128138
# pre-load language models
@@ -174,7 +184,7 @@ if ( !$create_model ) {
174184
if ($create_model) {
175185
my %ngram = ();
176186
my @result = create_lm( input(), \%ngram );
177-
print join( "\n", map { "$_\t $ngram{$_}"; } @result ), "\n";
187+
print join( "\n", map { "$_\t$ngram{$_}"; } @result ), "\n";
178188
}
179189
elsif ($read_from_cl) {
180190
classify( $read_from_cl );
@@ -223,6 +233,10 @@ sub classify {
223233
}
224234

225235
$results{$language} = $p;
236+
237+
if ( $boost{$language} ) {
238+
$results{$language} = int( $results{$language} * ( 1 - $boost{$language} ) + 0.5 );
239+
}
226240
}
227241

228242
my @results =

0 commit comments

Comments
 (0)