-
Notifications
You must be signed in to change notification settings - Fork 227
Expand file tree
/
Copy pathPyReader.java
More file actions
305 lines (270 loc) · 10.3 KB
/
PyReader.java
File metadata and controls
305 lines (270 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
/* Copyright (c) Jython Developers */
package org.python.modules._csv;
import org.python.core.PyIterator;
import org.python.core.PyList;
import org.python.core.PyObject;
import org.python.core.PyString;
import org.python.core.PyType;
import org.python.core.Visitproc;
import org.python.expose.ExposedGet;
import org.python.expose.ExposedType;
/**
 * CSV file reader.
 *
 * Analogous to CPython's _csv.c::ReaderObj struct.
 *
 * <p>Each call to {@link #__iternext__()} pulls one or more lines from the underlying input
 * iterator and feeds them, one character at a time, through a small state machine
 * ({@link ParserState}) until a complete record has been assembled.
 */
@ExposedType(name = "_csv.reader", doc = PyReader.reader_doc)
public class PyReader extends PyIterator {

    public static final PyType TYPE = PyType.fromClass(PyReader.class);

    public static final String reader_doc =
        "CSV reader\n" +
        "\n" +
        "Reader objects are responsible for reading and parsing tabular data\n" +
        "in CSV format.\n";

    /** Parsing Dialect. */
    @ExposedGet
    public PyDialect dialect;

    /** The current line number: incremented once per line pulled from the input iterator. */
    @ExposedGet
    public int line_num = 0;

    /** The underlying input iterator. */
    private PyObject input_iter;

    /** Current CSV parse state. */
    private ParserState state = ParserState.START_RECORD;

    /** Field list for current record. */
    private PyList fields = new PyList();

    /** Accumulates the characters of the field currently being parsed. */
    private StringBuffer field = new StringBuffer(INITIAL_BUILDER_CAPACITY);

    /** Whether the field should be treated as numeric. */
    private boolean numeric_field = false;

    /** Initial capacity of the field StringBuffer. */
    private static final int INITIAL_BUILDER_CAPACITY = 4096;

    /**
     * Creates a reader over the given input.
     *
     * @param input_iter iterator whose {@code __iternext__} supplies the input lines
     * @param dialect the dialect that controls parsing
     */
    public PyReader(PyObject input_iter, PyDialect dialect) {
        this.input_iter = input_iter;
        this.dialect = dialect;
    }

    /**
     * Parses and returns the next record.
     *
     * <p>Lines are consumed from the input iterator until the parser is back in
     * {@code START_RECORD}, so a record may span several input lines (for example when a
     * quoted field is still open at the end of a line).
     *
     * @return the list of fields of the next record, or {@code null} when the input iterator
     *         is exhausted and no partial field is pending
     */
    public PyObject __iternext__() {
        PyObject lineobj;
        PyObject fields;
        String line;
        char c;
        int linelen;

        parse_reset();
        do {
            lineobj = input_iter.__iternext__();
            if (lineobj == null) {
                // End of input OR exception
                if (field.length() != 0 || state == ParserState.IN_QUOTED_FIELD) {
                    // Input ended in the middle of a field.
                    if (dialect.strict) {
                        throw _csv.Error("unexpected end of data");
                    } else {
                        // Lenient mode: emit what has been collected so far.
                        parse_save_field();
                        break;
                    }
                }
                return null;
            }

            line_num++;
            line = lineobj.toString();
            linelen = line.length();

            for (int i = 0; i < linelen; i++) {
                c = line.charAt(i);
                // '\0' is reserved as the end-of-line sentinel below, so it is rejected
                // when it appears in the data itself.
                if (c == '\0') {
                    throw _csv.Error("line contains NULL byte");
                }
                parse_process_char(c);
            }
            // Signal end of line to the state machine.
            parse_process_char('\0');
        } while (state != ParserState.START_RECORD);

        // Hand the completed record to the caller and start a fresh list.
        fields = this.fields;
        this.fields = new PyList();

        return fields;
    }

    /**
     * Advances the state machine by one character.
     *
     * @param c the next input character, or {@code '\0'} to mark the end of the current line
     */
    @SuppressWarnings("fallthrough")
    private void parse_process_char(char c) {
        switch (state) {
            case START_RECORD:
                // start of record
                if (c == '\0') {
                    // empty line - return []
                    break;
                } else if (c == '\n' || c == '\r') {
                    state = ParserState.EAT_CRNL;
                    break;
                }
                // normal character - handle as START_FIELD
                state = ParserState.START_FIELD;
                // *** fallthru ***
            case START_FIELD:
                // expecting field
                if (c == '\n' || c == '\r' || c == '\0') {
                    // save empty field - return [fields]
                    parse_save_field();
                    state = c == '\0' ? ParserState.START_RECORD : ParserState.EAT_CRNL;
                } else if (c == dialect.quotechar && dialect.quoting != QuoteStyle.QUOTE_NONE) {
                    // start quoted field
                    state = ParserState.IN_QUOTED_FIELD;
                } else if (c == dialect.escapechar) {
                    // possible escaped character
                    state = ParserState.ESCAPED_CHAR;
                } else if (c == ' ' && dialect.skipinitialspace) {
                    // ignore space at start of field
                    ;
                } else if (c == dialect.delimiter) {
                    // save empty field
                    parse_save_field();
                } else {
                    // begin new unquoted field
                    if (dialect.quoting == QuoteStyle.QUOTE_NONNUMERIC) {
                        numeric_field = true;
                    }
                    parse_add_char(c);
                    state = ParserState.IN_FIELD;
                }
                break;

            case ESCAPED_CHAR:
                // An escape character at the end of a line stores a newline.
                if (c == '\0') {
                    c = '\n';
                }
                parse_add_char(c);
                state = ParserState.IN_FIELD;
                break;

            case IN_FIELD:
                // in unquoted field
                if (c == '\n' || c == '\r' || c == '\0') {
                    // end of line - return [fields]
                    parse_save_field();
                    state = c == '\0' ? ParserState.START_RECORD : ParserState.EAT_CRNL;
                } else if (c == dialect.escapechar) {
                    // possible escaped character
                    state = ParserState.ESCAPED_CHAR;
                } else if (c == dialect.delimiter) {
                    // save field - wait for new field
                    parse_save_field();
                    state = ParserState.START_FIELD;
                } else {
                    // normal character - save in field
                    parse_add_char(c);
                }
                break;

            case IN_QUOTED_FIELD:
                // in quoted field
                if (c == '\0') {
                    // end of line inside quotes - stay in this state so that
                    // __iternext__ keeps reading lines for the same record
                    ;
                } else if (c == dialect.escapechar) {
                    // Possible escape character
                    state = ParserState.ESCAPE_IN_QUOTED_FIELD;
                } else if (c == dialect.quotechar && dialect.quoting != QuoteStyle.QUOTE_NONE) {
                    if (dialect.doublequote) {
                        // doublequote; " represented by ""
                        state = ParserState.QUOTE_IN_QUOTED_FIELD;
                    } else {
                        // end of quote part of field
                        state = ParserState.IN_FIELD;
                    }
                } else {
                    // normal character - save in field
                    parse_add_char(c);
                }
                break;

            case ESCAPE_IN_QUOTED_FIELD:
                // An escape character at the end of a line stores a newline.
                if (c == '\0') {
                    c = '\n';
                }
                parse_add_char(c);
                state = ParserState.IN_QUOTED_FIELD;
                break;

            case QUOTE_IN_QUOTED_FIELD:
                // doublequote - seen a quote in an quoted field
                if (dialect.quoting != QuoteStyle.QUOTE_NONE && c == dialect.quotechar) {
                    // save "" as "
                    parse_add_char(c);
                    state = ParserState.IN_QUOTED_FIELD;
                } else if (c == dialect.delimiter) {
                    // save field - wait for new field
                    parse_save_field();
                    state = ParserState.START_FIELD;
                } else if (c == '\n' || c == '\r' || c == '\0') {
                    // end of line - return [fields]
                    parse_save_field();
                    state = c == '\0' ? ParserState.START_RECORD : ParserState.EAT_CRNL;
                } else if (!dialect.strict) {
                    // lenient mode: keep the character and continue as an unquoted field
                    parse_add_char(c);
                    state = ParserState.IN_FIELD;
                } else {
                    // illegal
                    throw _csv.Error(String.format("'%c' expected after '%c'",
                                                   dialect.delimiter, dialect.quotechar));
                }
                break;

            case EAT_CRNL:
                if (c == '\n' || c == '\r') {
                    ;
                } else if (c == '\0') {
                    state = ParserState.START_RECORD;
                } else {
                    String err = "new-line character seen in unquoted field - do you need to "
                            + "open the file in universal-newline mode?";
                    throw _csv.Error(err);
                }
                break;
        }
    }

    /**
     * Prepares the parser for a new record: fresh field list, initial state, numeric flag
     * cleared. The pending field buffer is not touched here.
     */
    private void parse_reset() {
        fields = new PyList();
        state = ParserState.START_RECORD;
        numeric_field = false;
    }

    /**
     * Appends the pending field to the current record and starts a new, empty field buffer.
     * When the numeric flag is set the field is converted with {@code __float__()} first and
     * the flag is cleared.
     */
    private void parse_save_field() {
        PyObject field;

        field = new PyString(this.field.toString());
        if (numeric_field) {
            numeric_field = false;
            field = field.__float__();
        }
        fields.append(field);

        // XXX: fastest way to clear StringBuffer?
        this.field = new StringBuffer(INITIAL_BUILDER_CAPACITY);
    }

    /**
     * Appends one character to the pending field.
     *
     * <p>Raises the module's {@code _csv.Error} if the field has already reached
     * {@code _csv.field_limit} characters.
     *
     * @param c the character to append
     */
    private void parse_add_char(char c) {
        int field_len = field.length();
        if (field_len >= _csv.field_limit) {
            throw _csv.Error(String.format("field larger than field limit (%d)",
                                           _csv.field_limit));
        }
        field.append(c);
    }

    /**
     * State of the CSV reader.
     */
    private enum ParserState {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD,
        QUOTE_IN_QUOTED_FIELD, EAT_CRNL;
    }


    /* Traverseproc implementation */

    /**
     * Visits the objects this reader holds: whatever the superclass visits, then
     * {@code dialect}, {@code input_iter} and {@code fields} (each only when non-null).
     *
     * @return the first non-zero value returned by a visit, or 0 if every visit returned 0
     */
    @Override
    public int traverse(Visitproc visit, Object arg) {
        int retVal = super.traverse(visit, arg);
        if (retVal != 0) {
            return retVal;
        }
        if (dialect != null) {
            retVal = visit.visit(dialect, arg);
            if (retVal != 0) {
                return retVal;
            }
        }
        if (input_iter != null) {
            retVal = visit.visit(input_iter, arg);
            if (retVal != 0) {
                return retVal;
            }
        }
        return fields != null ? visit.visit(fields, arg) : 0;
    }

    /**
     * Reports whether {@code ob} is one of the objects visited by
     * {@link #traverse(Visitproc, Object)}.
     *
     * @return {@code true} if {@code ob} is non-null and is identical to {@code fields},
     *         {@code dialect} or {@code input_iter}, or the superclass reports a reference
     */
    @Override
    public boolean refersDirectlyTo(PyObject ob) {
        // The null guard must be "!= null": a null argument is never a referent, and
        // without the guard an unset (null) member would spuriously match a null argument.
        return ob != null && (ob == fields || ob == dialect
            || ob == input_iter || super.refersDirectlyTo(ob));
    }
}