-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathclassencode.cpp
More file actions
188 lines (173 loc) · 7.7 KB
/
classencode.cpp
File metadata and controls
188 lines (173 loc) · 7.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#include <string>
#include <iostream>
#include <getopt.h>
#include "classencoder.h"
#include "common.h"
#include "config.h"
/*****************************
* Colibri Core
* by Maarten van Gompel
* Centre for Language Studies
* Radboud University Nijmegen
*
* http://proycon.github.io/colibri-core
*
* Licensed under GPLv3
*****************************/
using namespace std;
/**
 * Prints the command-line help text of colibri-classencode to standard error.
 *
 * The first line carries the VERSION macro; all remaining lines are fixed text
 * kept in a table and written one per line. An empty table entry produces a
 * blank line in the output.
 */
void usage() {
    cerr << "Colibri Core " << VERSION << " - Class Encoder" << endl;

    // NOTE(review): the FoLiA line is guarded by the macro WITHFO; confirm this
    // is the macro name actually defined by the build configuration.
    static const char* const helplines[] = {
        " by Maarten van Gompel, Language Machines, Centre for Language Studies, Radboud University Nijmegen",
        " https://proycon.github.io/colibri-core",
        "",
        "Syntax: colibri-classencode [ -c classmodel ] corpus [corpus2 etc..]",
        "Description: Encodes a corpus. If used with -c, encodes a corpus according to the specified pre-existing class model",
        "",
        "The corpus file should be in one of the following formats:",
        " - plain text, preferably tokenised (tokens space delimited)",
        " one sentence per line, unix newlines, encoding agnostic.",
        " - as above, but bzip2 compressed (bz2 extension)",
#ifdef WITHFO
        " - FoLiA XML (xml extension)",
        "",
#endif
        "Options: -o outputprefix for class file",
        " -d output directory, including trailing slash",
        " -l read input filenames from list-file (one filename per line)",
        " -u produce one unified encoded corpus (in case multiple corpora are specified)",
        " -e extend specified class file with unseen classes",
        " -n ignore newlines in input, treat everything as one text blob",
        " -U encode all unseen classes using one special unknown class",
        " -t word occurrence threshold (default: 1)",
        " -F Import frequency list (word \\t frequency, per line)",
        " -V Import vocabulary file (one word per line)"
    };

    const size_t linecount = sizeof(helplines) / sizeof(helplines[0]);
    for (size_t n = 0; n < linecount; n++) {
        cerr << helplines[n] << endl;
    }
}
/**
 * Derives a default output prefix from a path: takes the part after the last
 * '/' (or the whole path when there is none) and passes it through
 * strip_extension for "bz2", "xml" and "txt", in that order.
 */
static string default_output_prefix(const string& path) {
    string prefix = path;
    const string::size_type lastslash = path.find_last_of("/");
    if (lastslash != string::npos) {
        prefix = path.substr(lastslash + 1);
    }
    strip_extension(prefix, "bz2");
    strip_extension(prefix, "xml");
    strip_extension(prefix, "txt");
    return prefix;
}

/**
 * Entry point of colibri-classencode.
 *
 * Parses the command line, obtains a class encoder (loaded from an existing
 * class file, built from a frequency list, or built from the corpora), saves
 * the class file where one was built, and then encodes every corpus file to
 * "<name>.colibri.dat".
 *
 * Exit behaviour visible here:
 *  - -h prints the usage text and exits with status 0;
 *  - an option rejected by getopt, or a call without any corpus file and
 *    without -F, exits with status 2;
 *  - an unreadable -l list file calls abort().
 */
int main(int argc, char* argv[]) {
    string classfile = "";
    string outputprefix = "";
    string outputdirectoryprefix = "";
    string freqlistfile = "";
    string vocabfile = "";
    vector<string> corpusfiles;
    bool unified = false;
    bool extend = false;
    bool allowunknown = false;
    bool ignorenewlines = false;
    int threshold = 0;

    // getopt() returns an int; storing it in a plain char breaks the comparison
    // with -1 on platforms where char is unsigned.
    int c;
    while ((c = getopt(argc, argv, "f:hc:o:d:ul:eUt:F:V:n")) != -1) {
        switch (c) {
            case 'f': //keep for backward compatibility
                corpusfiles.push_back(optarg);
                break;
            case 'c': classfile = optarg; break;
            case 'o': outputprefix = optarg; break;
            case 'd': outputdirectoryprefix = optarg; break;
            case 'u': unified = true; break;
            case 'n': ignorenewlines = true; break;
            case 'e': extend = true; break;
            case 'U': allowunknown = true; break;
            case 't': threshold = atoi(optarg); break;
            case 'l': {
                // A fresh stream per -l option, so a repeated -l does not
                // inherit the end-of-file state of the previous list.
                ifstream listin(optarg);
                if (!listin.good()) {
                    cerr << "Can't read " << optarg << endl;
                    abort();
                }
                // Loop on the extraction itself: testing eof() before reading
                // would add a duplicate/empty entry after the last filename.
                string listedfile;
                while (listin >> listedfile) {
                    corpusfiles.push_back(listedfile);
                }
                break;
            }
            case 'F': freqlistfile = optarg; break;
            case 'V': vocabfile = optarg; break;
            case 'h': usage(); exit(0);
            default:
                // optopt is an int; cast it so the option character is printed
                // rather than its numeric code.
                cerr << "Unknown option: -" << static_cast<char>(optopt) << endl;
                exit(2);
        }
    }

    // All remaining (non-option) arguments are corpus files.
    for (int i = optind; i < argc; i++) {
        corpusfiles.push_back(argv[i]);
    }

    if (corpusfiles.empty() && freqlistfile.empty()) {
        usage();
        exit(2);
    }

    // Make the output directory end in exactly one separator, so that the class
    // file and the encoded corpora are placed in the same directory whether or
    // not the user supplied the trailing slash.
    if (!outputdirectoryprefix.empty() && outputdirectoryprefix[outputdirectoryprefix.size() - 1] != '/') {
        outputdirectoryprefix += "/";
    }

    // Without -o, name the output after the first corpus file, or failing that
    // after the frequency list.
    if (outputprefix.empty() && !corpusfiles.empty()) {
        outputprefix = default_output_prefix(corpusfiles[0]);
    }
    if (outputprefix.empty() && !freqlistfile.empty()) {
        outputprefix = default_output_prefix(freqlistfile);
    }

    ClassEncoder classencoder;
    const string prefixedoutputprefix = outputdirectoryprefix + outputprefix;
    const string classoutputfile = prefixedoutputprefix + ".colibri.cls";

    if (!classfile.empty()) {
        cerr << "Loading classes from file" << endl;
        classencoder = ClassEncoder(classfile);
        if (extend) {
            cerr << "Building classes from corpus (extending existing classes)" << endl;
            classencoder.build(corpusfiles, false, threshold, vocabfile);
            classencoder.save(classoutputfile);
            cerr << "Built " << prefixedoutputprefix << ".colibri.cls , extending " << classfile << endl;
        }
    } else if (!freqlistfile.empty()) {
        cerr << "Building classes from imported frequency list or vocabulary file" << endl;
        classencoder = ClassEncoder();
        classencoder.buildclasses_freqlist(freqlistfile);
        classencoder.save(classoutputfile);
        cerr << "Built " << prefixedoutputprefix << ".colibri.cls" << endl;
    } else {
        cerr << "Building classes from corpus" << endl;
        classencoder = ClassEncoder();
        classencoder.build(corpusfiles, false, threshold, vocabfile);
        classencoder.save(classoutputfile);
        cerr << "Built " << prefixedoutputprefix << ".colibri.cls" << endl;
    }

    // Remember the highest class before encoding, to detect classes added
    // while encoding.
    const unsigned int highestclass = classencoder.gethighestclass();

    for (size_t i = 0; i < corpusfiles.size(); i++) {
        string outfile;
        if (unified) {
            // All corpora go into one output named after the output prefix.
            outfile = outputprefix;
        } else {
            outfile = corpusfiles[i];
            const string::size_type lastslash = outfile.find_last_of("/");
            if (lastslash != string::npos) {
                outfile = outfile.substr(lastslash + 1);
            }
        }
        strip_extension(outfile, "bz2");
        strip_extension(outfile, "txt");
        strip_extension(outfile, "xml");
        // The directory prefix is either empty or already ends in '/'.
        outfile = outputdirectoryprefix + outfile;

        cerr << "Encoding corpus " << corpusfiles[i] << " to " << outfile << ".colibri.dat" << endl;
        // The (unified && i > 0) flag is set for every corpus after the first
        // when a single unified output is requested.
        classencoder.encodefile(corpusfiles[i], outfile + ".colibri.dat", allowunknown, extend, (unified && i > 0), ignorenewlines);
        cerr << "...Done" << endl;
    }

    if (classencoder.gethighestclass() > highestclass) {
        if (extend) {
            // Write to the same location as the class file saved earlier, so
            // the output directory is honoured here as well.
            classencoder.save(classoutputfile);
            cerr << "Built " << prefixedoutputprefix << ".colibri.cls" << endl;
        } else {
            cerr << "WARNING: classes were added but the result was ignored! Use -e!" << endl;
        }
    }
    return 0;
}