Skip to content
This repository was archived by the owner on Aug 3, 2021. It is now read-only.

Commit 98ad236

Browse files
authored
Merge pull request #170 from NVIDIA/dev0.4
dev0.4
2 parents 1efa595 + 35b26f1 commit 98ad236

File tree

148 files changed

+11984
-3599
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

148 files changed

+11984
-3599
lines changed

detokenizer.perl

Lines changed: 373 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,373 @@
1+
#!/usr/bin/env perl
2+
3+
# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
4+
# Sample De-Tokenizer
5+
# written by Josh Schroeder, based on code by Philipp Koehn
6+
# further modifications by Ondrej Bojar
7+
#
8+
# This file is part of moses. Its use is licensed under the GNU Lesser General
9+
# Public License version 2.1 or, at your option, any later version.
10+
11+
binmode(STDIN, ":utf8");
12+
binmode(STDOUT, ":utf8");
13+
14+
use warnings;
15+
use strict;
16+
use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
17+
18+
my $language = "en";
19+
my $QUIET = 0;
20+
my $HELP = 0;
21+
my $UPPERCASE_SENT = 0;
22+
my $PENN = 0;
23+
24+
while (@ARGV) {
25+
$_ = shift;
26+
/^-b$/ && ($| = 1, next);
27+
/^-l$/ && ($language = shift, next);
28+
/^-q$/ && ($QUIET = 1, next);
29+
/^-h$/ && ($HELP = 1, next);
30+
/^-u$/ && ($UPPERCASE_SENT = 1, next);
31+
/^-penn$/ && ($PENN = 1, next);
32+
}
33+
34+
if ($HELP) {
35+
print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
36+
print "Options:\n";
37+
print " -u ... uppercase the first char in the final sentence.\n";
38+
print " -q ... don't report detokenizer revision.\n";
39+
print " -b ... disable Perl buffering.\n";
40+
print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
41+
exit;
42+
}
43+
44+
if ($language !~ /^(cs|en|fr|it|fi)$/) {
45+
print STDERR "Warning: No built-in rules for language $language.\n"
46+
}
47+
48+
if ($PENN && $language ne "en") {
49+
print STDERR "Error: -penn option only supported for English text.\n";
50+
exit;
51+
}
52+
53+
if (!$QUIET) {
54+
print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
55+
print STDERR "Language: $language\n";
56+
}
57+
58+
while(<STDIN>) {
59+
if (/^<.+>$/ || /^\s*$/) {
60+
#don't try to detokenize XML/HTML tag lines
61+
print $_;
62+
} elsif ($PENN) {
63+
print &detokenize_penn($_);
64+
} else {
65+
print &detokenize($_);
66+
}
67+
}
68+
69+
70+
sub ucsecondarg {
71+
# uppercase the second argument
72+
my $arg1 = shift;
73+
my $arg2 = shift;
74+
return $arg1.uc($arg2);
75+
}
76+
77+
sub deescape {
78+
# de-escape special chars
79+
my ($text) = @_;
80+
$text =~ s/\&bar;/\|/g; # factor separator (legacy)
81+
$text =~ s/\&#124;/\|/g; # factor separator
82+
$text =~ s/\&lt;/\</g; # xml
83+
$text =~ s/\&gt;/\>/g; # xml
84+
$text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy)
85+
$text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy)
86+
$text =~ s/\&quot;/\"/g; # xml
87+
$text =~ s/\&apos;/\'/g; # xml
88+
$text =~ s/\&#91;/\[/g; # syntax non-terminal
89+
$text =~ s/\&#93;/\]/g; # syntax non-terminal
90+
$text =~ s/\&amp;/\&/g; # escape escape
91+
return $text;
92+
}
93+
94+
sub detokenize {
95+
my($text) = @_;
96+
chomp($text);
97+
$text = " $text ";
98+
$text =~ s/ \@\-\@ /-/g;
99+
$text = &deescape($text);
100+
101+
my $word;
102+
my $i;
103+
my @words = split(/ /,$text);
104+
$text = "";
105+
my %quoteCount = ("\'"=>0,"\""=>0);
106+
my $prependSpace = " ";
107+
for ($i=0;$i<(scalar(@words));$i++) {
108+
if (&startsWithCJKChar($words[$i])) {
109+
if (($i > 0 && &endsWithCJKChar($words[$i-1])) && ($language ne "ko")) {
110+
# perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
111+
$text=$text.$words[$i];
112+
} else {
113+
# ... but do nothing special if this is a CJK word that doesn't follow a CJK word
114+
$text=$text.$prependSpace.$words[$i];
115+
}
116+
$prependSpace = " ";
117+
} elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
118+
#perform right shift on currency and other random punctuation items
119+
$text = $text.$prependSpace.$words[$i];
120+
$prependSpace = "";
121+
} elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
122+
if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) {
123+
#these punctuations are prefixed with a non-breakable space in french
124+
$text .= " "; }
125+
#perform left shift on punctuation items
126+
$text=$text.$words[$i];
127+
$prependSpace = " ";
128+
} elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
129+
#left-shift the contraction for English
130+
$text=$text.$words[$i];
131+
$prependSpace = " ";
132+
} elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
133+
#left-shift floats in Czech
134+
$text=$text.$words[$i];
135+
$prependSpace = " ";
136+
} elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
137+
#right-shift the contraction for French and Italian
138+
$text = $text.$prependSpace.$words[$i];
139+
$prependSpace = "";
140+
} elsif (($language eq "cs") && ($i<(scalar(@words)-3))
141+
&& ($words[$i] =~ /[\p{IsAlpha}]$/)
142+
&& ($words[$i+1] =~ /^[-–]$/)
143+
&& ($words[$i+2] =~ /^li$|^mail.*/i)
144+
) {
145+
#right-shift "-li" in Czech and a few Czech dashed words (e-mail)
146+
$text = $text.$prependSpace.$words[$i].$words[$i+1];
147+
$i++; # advance over the dash
148+
$prependSpace = "";
149+
} elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
150+
#combine punctuation smartly
151+
my $normalized_quo = $words[$i];
152+
$normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
153+
$quoteCount{$normalized_quo} = 0
154+
if !defined $quoteCount{$normalized_quo};
155+
if ($language eq "cs" && $words[$i] eq "") {
156+
# this is always the starting quote in Czech
157+
$quoteCount{$normalized_quo} = 0;
158+
}
159+
if ($language eq "cs" && $words[$i] eq "") {
160+
# this is usually the ending quote in Czech
161+
$quoteCount{$normalized_quo} = 1;
162+
}
163+
if (($quoteCount{$normalized_quo} % 2) eq 0) {
164+
if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
165+
#single quote for posesssives ending in s... "The Jones' house"
166+
#left shift
167+
$text=$text.$words[$i];
168+
$prependSpace = " ";
169+
} else {
170+
#right shift
171+
$text = $text.$prependSpace.$words[$i];
172+
$prependSpace = "";
173+
$quoteCount{$normalized_quo} ++;
174+
175+
}
176+
} else {
177+
#left shift
178+
$text=$text.$words[$i];
179+
$prependSpace = " ";
180+
$quoteCount{$normalized_quo} ++;
181+
182+
}
183+
184+
} elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
185+
# Finnish : without intervening space if followed by case suffix
186+
# EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
187+
$text=$text. lc $words[$i];
188+
$prependSpace = " ";
189+
} else {
190+
$text=$text.$prependSpace.$words[$i];
191+
$prependSpace = " ";
192+
}
193+
}
194+
195+
# clean up spaces at head and tail of each line as well as any double-spacing
196+
$text =~ s/ +/ /g;
197+
$text =~ s/\n /\n/g;
198+
$text =~ s/ \n/\n/g;
199+
$text =~ s/^ //g;
200+
$text =~ s/ $//g;
201+
202+
#add trailing break
203+
$text .= "\n" unless $text =~ /\n$/;
204+
205+
$text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
206+
207+
return $text;
208+
}
209+
210+
sub detokenize_penn {
211+
my($text) = @_;
212+
213+
chomp($text);
214+
$text = " $text ";
215+
$text =~ s/ \@\-\@ /-/g;
216+
$text =~ s/ \@\/\@ /\//g;
217+
$text = &deescape($text);
218+
219+
# merge de-contracted forms except where the second word begins with an
220+
# apostrophe (those are handled later)
221+
$text =~ s/ n't /n't /g;
222+
$text =~ s/ N'T /N'T /g;
223+
$text =~ s/ ([Cc])an not / $1annot /g;
224+
$text =~ s/ ([Dd])' ye / $1'ye /g;
225+
$text =~ s/ ([Gg])im me / $1imme /g;
226+
$text =~ s/ ([Gg])on na / $1onna /g;
227+
$text =~ s/ ([Gg])ot ta / $1otta /g;
228+
$text =~ s/ ([Ll])em me / $1emme /g;
229+
$text =~ s/ '([Tt]) is / '$1is /g;
230+
$text =~ s/ '([Tt]) was / '$1was /g;
231+
$text =~ s/ ([Ww])an na / $1anna /g;
232+
233+
# restore brackets
234+
$text =~ s/-LRB-/\(/g;
235+
$text =~ s/-RRB-/\)/g;
236+
$text =~ s/-LSB-/\[/g;
237+
$text =~ s/-RSB-/\]/g;
238+
$text =~ s/-LCB-/{/g;
239+
$text =~ s/-RCB-/}/g;
240+
241+
my $i;
242+
my @words = split(/ /,$text);
243+
$text = "";
244+
my $prependSpace = " ";
245+
for ($i=0;$i<(scalar(@words));$i++) {
246+
if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
247+
# perform right shift on currency and other random punctuation items
248+
$text = $text.$prependSpace.$words[$i];
249+
$prependSpace = "";
250+
} elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
251+
# perform left shift on punctuation items
252+
$text=$text.$words[$i];
253+
$prependSpace = " ";
254+
} elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
255+
# left-shift the contraction
256+
$text=$text.$words[$i];
257+
$prependSpace = " ";
258+
} elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
259+
# opening single quote: convert to straight quote and right-shift
260+
$text = $text.$prependSpace."\'";
261+
$prependSpace = "";
262+
} elsif ($words[$i] eq "``") {
263+
# opening double quote: convert to straight quote and right-shift
264+
$text = $text.$prependSpace."\"";
265+
$prependSpace = "";
266+
} elsif ($words[$i] eq "\'") {
267+
# closing single quote: convert to straight quote and left shift
268+
$text = $text."\'";
269+
$prependSpace = " ";
270+
} elsif ($words[$i] eq "\'\'") {
271+
# closing double quote: convert to straight quote and left shift
272+
$text = $text."\"";
273+
$prependSpace = " ";
274+
} else {
275+
$text = $text.$prependSpace.$words[$i];
276+
$prependSpace = " ";
277+
}
278+
}
279+
280+
# clean up spaces at head and tail of each line as well as any double-spacing
281+
$text =~ s/ +/ /g;
282+
$text =~ s/\n /\n/g;
283+
$text =~ s/ \n/\n/g;
284+
$text =~ s/^ //g;
285+
$text =~ s/ $//g;
286+
287+
# add trailing break
288+
$text .= "\n" unless $text =~ /\n$/;
289+
290+
$text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
291+
292+
return $text;
293+
}
294+
295+
sub startsWithCJKChar {
296+
my ($str) = @_;
297+
return 0 if length($str) == 0;
298+
my $firstChar = substr($str, 0, 1);
299+
return &charIsCJK($firstChar);
300+
}
301+
302+
sub endsWithCJKChar {
303+
my ($str) = @_;
304+
return 0 if length($str) == 0;
305+
my $lastChar = substr($str, length($str)-1, 1);
306+
return &charIsCJK($lastChar);
307+
}
308+
309+
# Given a string consisting of one character, returns true iff the character
310+
# is a CJK (Chinese/Japanese/Korean) character
311+
sub charIsCJK {
312+
my ($char) = @_;
313+
# $char should be a string of length 1
314+
my $codepoint = &codepoint_dec($char);
315+
316+
# The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
317+
318+
# Hangul Jamo (1100–11FF)
319+
return 1 if (&between_hexes($codepoint, '1100', '11FF'));
320+
321+
# CJK Radicals Supplement (2E80–2EFF)
322+
# Kangxi Radicals (2F00–2FDF)
323+
# Ideographic Description Characters (2FF0–2FFF)
324+
# CJK Symbols and Punctuation (3000–303F)
325+
# Hiragana (3040–309F)
326+
# Katakana (30A0–30FF)
327+
# Bopomofo (3100–312F)
328+
# Hangul Compatibility Jamo (3130–318F)
329+
# Kanbun (3190–319F)
330+
# Bopomofo Extended (31A0–31BF)
331+
# CJK Strokes (31C0–31EF)
332+
# Katakana Phonetic Extensions (31F0–31FF)
333+
# Enclosed CJK Letters and Months (3200–32FF)
334+
# CJK Compatibility (3300–33FF)
335+
# CJK Unified Ideographs Extension A (3400–4DBF)
336+
# Yijing Hexagram Symbols (4DC0–4DFF)
337+
# CJK Unified Ideographs (4E00–9FFF)
338+
# Yi Syllables (A000–A48F)
339+
# Yi Radicals (A490–A4CF)
340+
return 1 if (&between_hexes($codepoint, '2E80', 'A4CF'));
341+
342+
# Phags-pa (A840–A87F)
343+
return 1 if (&between_hexes($codepoint, 'A840', 'A87F'));
344+
345+
# Hangul Syllables (AC00–D7AF)
346+
return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF'));
347+
348+
# CJK Compatibility Ideographs (F900–FAFF)
349+
return 1 if (&between_hexes($codepoint, 'F900', 'FAFF'));
350+
351+
# CJK Compatibility Forms (FE30–FE4F)
352+
return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F'));
353+
354+
# Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
355+
return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC'));
356+
357+
# Supplementary Ideographic Plane 20000–2FFFF
358+
return 1 if (&between_hexes($codepoint, '20000', '2FFFF'));
359+
360+
return 0;
361+
}
362+
363+
# Returns the code point of a Unicode char, represented as a decimal number
364+
sub codepoint_dec {
365+
if (my $char = shift) {
366+
return unpack('U0U*', $char);
367+
}
368+
}
369+
370+
sub between_hexes {
371+
my ($num, $left, $right) = @_;
372+
return $num >= hex($left) && $num <= hex($right);
373+
}

0 commit comments

Comments
 (0)