|
| 1 | +#!/usr/bin/env perl |
| 2 | + |
| 3 | +# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $ |
| 4 | +# Sample De-Tokenizer |
| 5 | +# written by Josh Schroeder, based on code by Philipp Koehn |
| 6 | +# further modifications by Ondrej Bojar |
| 7 | +# |
| 8 | +# This file is part of moses. Its use is licensed under the GNU Lesser General |
| 9 | +# Public License version 2.1 or, at your option, any later version. |
| 10 | + |
| 11 | +binmode(STDIN, ":utf8"); |
| 12 | +binmode(STDOUT, ":utf8"); |
| 13 | + |
| 14 | +use warnings; |
| 15 | +use strict; |
| 16 | +use utf8; # tell perl this script file is in UTF-8 (see all funny punct below) |
| 17 | + |
| 18 | +my $language = "en"; |
| 19 | +my $QUIET = 0; |
| 20 | +my $HELP = 0; |
| 21 | +my $UPPERCASE_SENT = 0; |
| 22 | +my $PENN = 0; |
| 23 | + |
| 24 | +while (@ARGV) { |
| 25 | + $_ = shift; |
| 26 | + /^-b$/ && ($| = 1, next); |
| 27 | + /^-l$/ && ($language = shift, next); |
| 28 | + /^-q$/ && ($QUIET = 1, next); |
| 29 | + /^-h$/ && ($HELP = 1, next); |
| 30 | + /^-u$/ && ($UPPERCASE_SENT = 1, next); |
| 31 | + /^-penn$/ && ($PENN = 1, next); |
| 32 | +} |
| 33 | + |
| 34 | +if ($HELP) { |
| 35 | + print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n"; |
| 36 | + print "Options:\n"; |
| 37 | + print " -u ... uppercase the first char in the final sentence.\n"; |
| 38 | + print " -q ... don't report detokenizer revision.\n"; |
| 39 | + print " -b ... disable Perl buffering.\n"; |
| 40 | + print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n"; |
| 41 | + exit; |
| 42 | +} |
| 43 | + |
| 44 | +if ($language !~ /^(cs|en|fr|it|fi)$/) { |
| 45 | + print STDERR "Warning: No built-in rules for language $language.\n" |
| 46 | +} |
| 47 | + |
| 48 | +if ($PENN && $language ne "en") { |
| 49 | + print STDERR "Error: -penn option only supported for English text.\n"; |
| 50 | + exit; |
| 51 | +} |
| 52 | + |
| 53 | +if (!$QUIET) { |
| 54 | + print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n"; |
| 55 | + print STDERR "Language: $language\n"; |
| 56 | +} |
| 57 | + |
| 58 | +while(<STDIN>) { |
| 59 | + if (/^<.+>$/ || /^\s*$/) { |
| 60 | + #don't try to detokenize XML/HTML tag lines |
| 61 | + print $_; |
| 62 | + } elsif ($PENN) { |
| 63 | + print &detokenize_penn($_); |
| 64 | + } else { |
| 65 | + print &detokenize($_); |
| 66 | + } |
| 67 | +} |
| 68 | + |
| 69 | + |
| 70 | +sub ucsecondarg { |
| 71 | + # uppercase the second argument |
| 72 | + my $arg1 = shift; |
| 73 | + my $arg2 = shift; |
| 74 | + return $arg1.uc($arg2); |
| 75 | +} |
| 76 | + |
| 77 | +sub deescape { |
| 78 | + # de-escape special chars |
| 79 | + my ($text) = @_; |
| 80 | + $text =~ s/\&bar;/\|/g; # factor separator (legacy) |
| 81 | + $text =~ s/\|/\|/g; # factor separator |
| 82 | + $text =~ s/\</\</g; # xml |
| 83 | + $text =~ s/\>/\>/g; # xml |
| 84 | + $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy) |
| 85 | + $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy) |
| 86 | + $text =~ s/\"/\"/g; # xml |
| 87 | + $text =~ s/\'/\'/g; # xml |
| 88 | + $text =~ s/\[/\[/g; # syntax non-terminal |
| 89 | + $text =~ s/\]/\]/g; # syntax non-terminal |
| 90 | + $text =~ s/\&/\&/g; # escape escape |
| 91 | + return $text; |
| 92 | +} |
| 93 | + |
| 94 | +sub detokenize { |
| 95 | + my($text) = @_; |
| 96 | + chomp($text); |
| 97 | + $text = " $text "; |
| 98 | + $text =~ s/ \@\-\@ /-/g; |
| 99 | + $text = &deescape($text); |
| 100 | + |
| 101 | + my $word; |
| 102 | + my $i; |
| 103 | + my @words = split(/ /,$text); |
| 104 | + $text = ""; |
| 105 | + my %quoteCount = ("\'"=>0,"\""=>0); |
| 106 | + my $prependSpace = " "; |
| 107 | + for ($i=0;$i<(scalar(@words));$i++) { |
| 108 | + if (&startsWithCJKChar($words[$i])) { |
| 109 | + if (($i > 0 && &endsWithCJKChar($words[$i-1])) && ($language ne "ko")) { |
| 110 | + # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word |
| 111 | + $text=$text.$words[$i]; |
| 112 | + } else { |
| 113 | + # ... but do nothing special if this is a CJK word that doesn't follow a CJK word |
| 114 | + $text=$text.$prependSpace.$words[$i]; |
| 115 | + } |
| 116 | + $prependSpace = " "; |
| 117 | + } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { |
| 118 | + #perform right shift on currency and other random punctuation items |
| 119 | + $text = $text.$prependSpace.$words[$i]; |
| 120 | + $prependSpace = ""; |
| 121 | + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ |
| 122 | + if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) { |
| 123 | + #these punctuations are prefixed with a non-breakable space in french |
| 124 | + $text .= " "; } |
| 125 | + #perform left shift on punctuation items |
| 126 | + $text=$text.$words[$i]; |
| 127 | + $prependSpace = " "; |
| 128 | + } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { |
| 129 | + #left-shift the contraction for English |
| 130 | + $text=$text.$words[$i]; |
| 131 | + $prependSpace = " "; |
| 132 | + } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) { |
| 133 | + #left-shift floats in Czech |
| 134 | + $text=$text.$words[$i]; |
| 135 | + $prependSpace = " "; |
| 136 | + } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) { |
| 137 | + #right-shift the contraction for French and Italian |
| 138 | + $text = $text.$prependSpace.$words[$i]; |
| 139 | + $prependSpace = ""; |
| 140 | + } elsif (($language eq "cs") && ($i<(scalar(@words)-3)) |
| 141 | + && ($words[$i] =~ /[\p{IsAlpha}]$/) |
| 142 | + && ($words[$i+1] =~ /^[-–]$/) |
| 143 | + && ($words[$i+2] =~ /^li$|^mail.*/i) |
| 144 | + ) { |
| 145 | + #right-shift "-li" in Czech and a few Czech dashed words (e-mail) |
| 146 | + $text = $text.$prependSpace.$words[$i].$words[$i+1]; |
| 147 | + $i++; # advance over the dash |
| 148 | + $prependSpace = ""; |
| 149 | + } elsif ($words[$i] =~ /^[\'\"„“`]+$/) { |
| 150 | + #combine punctuation smartly |
| 151 | + my $normalized_quo = $words[$i]; |
| 152 | + $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/; |
| 153 | + $quoteCount{$normalized_quo} = 0 |
| 154 | + if !defined $quoteCount{$normalized_quo}; |
| 155 | + if ($language eq "cs" && $words[$i] eq "„") { |
| 156 | + # this is always the starting quote in Czech |
| 157 | + $quoteCount{$normalized_quo} = 0; |
| 158 | + } |
| 159 | + if ($language eq "cs" && $words[$i] eq "“") { |
| 160 | + # this is usually the ending quote in Czech |
| 161 | + $quoteCount{$normalized_quo} = 1; |
| 162 | + } |
| 163 | + if (($quoteCount{$normalized_quo} % 2) eq 0) { |
| 164 | + if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) { |
| 165 | + #single quote for posesssives ending in s... "The Jones' house" |
| 166 | + #left shift |
| 167 | + $text=$text.$words[$i]; |
| 168 | + $prependSpace = " "; |
| 169 | + } else { |
| 170 | + #right shift |
| 171 | + $text = $text.$prependSpace.$words[$i]; |
| 172 | + $prependSpace = ""; |
| 173 | + $quoteCount{$normalized_quo} ++; |
| 174 | + |
| 175 | + } |
| 176 | + } else { |
| 177 | + #left shift |
| 178 | + $text=$text.$words[$i]; |
| 179 | + $prependSpace = " "; |
| 180 | + $quoteCount{$normalized_quo} ++; |
| 181 | + |
| 182 | + } |
| 183 | + |
| 184 | + } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) { |
| 185 | + # Finnish : without intervening space if followed by case suffix |
| 186 | + # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ... |
| 187 | + $text=$text. lc $words[$i]; |
| 188 | + $prependSpace = " "; |
| 189 | + } else { |
| 190 | + $text=$text.$prependSpace.$words[$i]; |
| 191 | + $prependSpace = " "; |
| 192 | + } |
| 193 | + } |
| 194 | + |
| 195 | + # clean up spaces at head and tail of each line as well as any double-spacing |
| 196 | + $text =~ s/ +/ /g; |
| 197 | + $text =~ s/\n /\n/g; |
| 198 | + $text =~ s/ \n/\n/g; |
| 199 | + $text =~ s/^ //g; |
| 200 | + $text =~ s/ $//g; |
| 201 | + |
| 202 | + #add trailing break |
| 203 | + $text .= "\n" unless $text =~ /\n$/; |
| 204 | + |
| 205 | + $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; |
| 206 | + |
| 207 | + return $text; |
| 208 | +} |
| 209 | + |
| 210 | +sub detokenize_penn { |
| 211 | + my($text) = @_; |
| 212 | + |
| 213 | + chomp($text); |
| 214 | + $text = " $text "; |
| 215 | + $text =~ s/ \@\-\@ /-/g; |
| 216 | + $text =~ s/ \@\/\@ /\//g; |
| 217 | + $text = &deescape($text); |
| 218 | + |
| 219 | + # merge de-contracted forms except where the second word begins with an |
| 220 | + # apostrophe (those are handled later) |
| 221 | + $text =~ s/ n't /n't /g; |
| 222 | + $text =~ s/ N'T /N'T /g; |
| 223 | + $text =~ s/ ([Cc])an not / $1annot /g; |
| 224 | + $text =~ s/ ([Dd])' ye / $1'ye /g; |
| 225 | + $text =~ s/ ([Gg])im me / $1imme /g; |
| 226 | + $text =~ s/ ([Gg])on na / $1onna /g; |
| 227 | + $text =~ s/ ([Gg])ot ta / $1otta /g; |
| 228 | + $text =~ s/ ([Ll])em me / $1emme /g; |
| 229 | + $text =~ s/ '([Tt]) is / '$1is /g; |
| 230 | + $text =~ s/ '([Tt]) was / '$1was /g; |
| 231 | + $text =~ s/ ([Ww])an na / $1anna /g; |
| 232 | + |
| 233 | + # restore brackets |
| 234 | + $text =~ s/-LRB-/\(/g; |
| 235 | + $text =~ s/-RRB-/\)/g; |
| 236 | + $text =~ s/-LSB-/\[/g; |
| 237 | + $text =~ s/-RSB-/\]/g; |
| 238 | + $text =~ s/-LCB-/{/g; |
| 239 | + $text =~ s/-RCB-/}/g; |
| 240 | + |
| 241 | + my $i; |
| 242 | + my @words = split(/ /,$text); |
| 243 | + $text = ""; |
| 244 | + my $prependSpace = " "; |
| 245 | + for ($i=0;$i<(scalar(@words));$i++) { |
| 246 | + if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { |
| 247 | + # perform right shift on currency and other random punctuation items |
| 248 | + $text = $text.$prependSpace.$words[$i]; |
| 249 | + $prependSpace = ""; |
| 250 | + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ |
| 251 | + # perform left shift on punctuation items |
| 252 | + $text=$text.$words[$i]; |
| 253 | + $prependSpace = " "; |
| 254 | + } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { |
| 255 | + # left-shift the contraction |
| 256 | + $text=$text.$words[$i]; |
| 257 | + $prependSpace = " "; |
| 258 | + } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only |
| 259 | + # opening single quote: convert to straight quote and right-shift |
| 260 | + $text = $text.$prependSpace."\'"; |
| 261 | + $prependSpace = ""; |
| 262 | + } elsif ($words[$i] eq "``") { |
| 263 | + # opening double quote: convert to straight quote and right-shift |
| 264 | + $text = $text.$prependSpace."\""; |
| 265 | + $prependSpace = ""; |
| 266 | + } elsif ($words[$i] eq "\'") { |
| 267 | + # closing single quote: convert to straight quote and left shift |
| 268 | + $text = $text."\'"; |
| 269 | + $prependSpace = " "; |
| 270 | + } elsif ($words[$i] eq "\'\'") { |
| 271 | + # closing double quote: convert to straight quote and left shift |
| 272 | + $text = $text."\""; |
| 273 | + $prependSpace = " "; |
| 274 | + } else { |
| 275 | + $text = $text.$prependSpace.$words[$i]; |
| 276 | + $prependSpace = " "; |
| 277 | + } |
| 278 | + } |
| 279 | + |
| 280 | + # clean up spaces at head and tail of each line as well as any double-spacing |
| 281 | + $text =~ s/ +/ /g; |
| 282 | + $text =~ s/\n /\n/g; |
| 283 | + $text =~ s/ \n/\n/g; |
| 284 | + $text =~ s/^ //g; |
| 285 | + $text =~ s/ $//g; |
| 286 | + |
| 287 | + # add trailing break |
| 288 | + $text .= "\n" unless $text =~ /\n$/; |
| 289 | + |
| 290 | + $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; |
| 291 | + |
| 292 | + return $text; |
| 293 | +} |
| 294 | + |
| 295 | +sub startsWithCJKChar { |
| 296 | + my ($str) = @_; |
| 297 | + return 0 if length($str) == 0; |
| 298 | + my $firstChar = substr($str, 0, 1); |
| 299 | + return &charIsCJK($firstChar); |
| 300 | +} |
| 301 | + |
| 302 | +sub endsWithCJKChar { |
| 303 | + my ($str) = @_; |
| 304 | + return 0 if length($str) == 0; |
| 305 | + my $lastChar = substr($str, length($str)-1, 1); |
| 306 | + return &charIsCJK($lastChar); |
| 307 | +} |
| 308 | + |
| 309 | +# Given a string consisting of one character, returns true iff the character |
| 310 | +# is a CJK (Chinese/Japanese/Korean) character |
| 311 | +sub charIsCJK { |
| 312 | + my ($char) = @_; |
| 313 | + # $char should be a string of length 1 |
| 314 | + my $codepoint = &codepoint_dec($char); |
| 315 | + |
| 316 | + # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane |
| 317 | + |
| 318 | + # Hangul Jamo (1100–11FF) |
| 319 | + return 1 if (&between_hexes($codepoint, '1100', '11FF')); |
| 320 | + |
| 321 | + # CJK Radicals Supplement (2E80–2EFF) |
| 322 | + # Kangxi Radicals (2F00–2FDF) |
| 323 | + # Ideographic Description Characters (2FF0–2FFF) |
| 324 | + # CJK Symbols and Punctuation (3000–303F) |
| 325 | + # Hiragana (3040–309F) |
| 326 | + # Katakana (30A0–30FF) |
| 327 | + # Bopomofo (3100–312F) |
| 328 | + # Hangul Compatibility Jamo (3130–318F) |
| 329 | + # Kanbun (3190–319F) |
| 330 | + # Bopomofo Extended (31A0–31BF) |
| 331 | + # CJK Strokes (31C0–31EF) |
| 332 | + # Katakana Phonetic Extensions (31F0–31FF) |
| 333 | + # Enclosed CJK Letters and Months (3200–32FF) |
| 334 | + # CJK Compatibility (3300–33FF) |
| 335 | + # CJK Unified Ideographs Extension A (3400–4DBF) |
| 336 | + # Yijing Hexagram Symbols (4DC0–4DFF) |
| 337 | + # CJK Unified Ideographs (4E00–9FFF) |
| 338 | + # Yi Syllables (A000–A48F) |
| 339 | + # Yi Radicals (A490–A4CF) |
| 340 | + return 1 if (&between_hexes($codepoint, '2E80', 'A4CF')); |
| 341 | + |
| 342 | + # Phags-pa (A840–A87F) |
| 343 | + return 1 if (&between_hexes($codepoint, 'A840', 'A87F')); |
| 344 | + |
| 345 | + # Hangul Syllables (AC00–D7AF) |
| 346 | + return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF')); |
| 347 | + |
| 348 | + # CJK Compatibility Ideographs (F900–FAFF) |
| 349 | + return 1 if (&between_hexes($codepoint, 'F900', 'FAFF')); |
| 350 | + |
| 351 | + # CJK Compatibility Forms (FE30–FE4F) |
| 352 | + return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F')); |
| 353 | + |
| 354 | + # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters |
| 355 | + return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC')); |
| 356 | + |
| 357 | + # Supplementary Ideographic Plane 20000–2FFFF |
| 358 | + return 1 if (&between_hexes($codepoint, '20000', '2FFFF')); |
| 359 | + |
| 360 | + return 0; |
| 361 | +} |
| 362 | + |
| 363 | +# Returns the code point of a Unicode char, represented as a decimal number |
| 364 | +sub codepoint_dec { |
| 365 | + if (my $char = shift) { |
| 366 | + return unpack('U0U*', $char); |
| 367 | + } |
| 368 | +} |
| 369 | + |
| 370 | +sub between_hexes { |
| 371 | + my ($num, $left, $right) = @_; |
| 372 | + return $num >= hex($left) && $num <= hex($right); |
| 373 | +} |
0 commit comments