Skip to content

Commit 177158c

Browse files
committed
updated turtle rdfparser such that it passes the turtle test suite
1 parent 3b5a6df commit 177158c

12 files changed

Lines changed: 2149 additions & 307 deletions

File tree

core/src/main/java/com/github/jsonldjava/core/JSONLDConsts.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ public class JSONLDConsts {
1010
public static final String XSD_BOOLEAN = XSD_NS + "boolean";
1111
public static final String XSD_DOUBLE = XSD_NS + "double";
1212
public static final String XSD_INTEGER = XSD_NS + "integer";
13+
public static final String XSD_FLOAT = XSD_NS + "float";
14+
public static final String XSD_DECIMAL = XSD_NS + "decimal";
1315
public static final String XSD_ANYURI = XSD_NS + "anyURI";
1416
public static final String XSD_STRING = XSD_NS + "string";
1517

core/src/main/java/com/github/jsonldjava/core/JSONLDProcessingError.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,12 @@ public Map<String,Object> getDetails() {
4141
return details;
4242
}
4343

44+
@Override
45+
public String getMessage() {
46+
String msg = super.getMessage();
47+
for (String key: details.keySet()) {
48+
msg += " {" + key + ":" + details.get(key) + "}";
49+
}
50+
return msg;
51+
}
4452
}

core/src/main/java/com/github/jsonldjava/core/JSONLDTripleCallback.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
package com.github.jsonldjava.core;
22

3-
import java.util.Map;
4-
53
/**
64
*
75
* @author Tristan

core/src/main/java/com/github/jsonldjava/core/RDFDataset.java

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,13 @@ public void getNamespace(String ns) {
288288
context.get(ns);
289289
}
290290

291+
/**
292+
* Removes every entry from the namespace map ({@code context}) of this dataset.
293+
*/
294+
public void clearNamespaces() {
295+
context.clear();
296+
}
297+
291298
/**
 * Returns the namespace map of this dataset.
 *
 * @return the {@code context} map itself, not a copy
 */
public Map<String, String> getNamespaces() {
292299
return context;
293300
}
@@ -421,18 +428,21 @@ void graphToRDF(String graphName, Map<String,Object> graph) {
421428
continue;
422429
}
423430

431+
// RDF subjects
432+
Node subject;
433+
if (id == null) {
434+
// TODO: this is a hack to handle the node generator not handling blank nodes that have a "@list" property alongside other properties
435+
subject = new BlankNode(namer.getName("undefined"));
436+
} else if (id.indexOf("_:") == 0) {
437+
subject = new BlankNode(namer.getName(id));
438+
} else {
439+
subject = new IRI(id);
440+
}
441+
442+
// RDF predicates
443+
Node predicate = new IRI(property);
444+
424445
for (Object item : (List<Object>) items) {
425-
// RDF subjects
426-
Node subject;
427-
if (id.indexOf("_:") == 0) {
428-
subject = new BlankNode(namer.getName(id));
429-
} else {
430-
subject = new IRI(id);
431-
}
432-
433-
// RDF predicates
434-
Node predicate = new IRI(property);
435-
436446
// convert @list to triples
437447
if (isList(item)) {
438448
//listToRDF((List<Object>) ((Map<String, Object>) item).get("@list"), namer, subject, predicate, rval);

core/src/main/java/com/github/jsonldjava/core/RDFDatasetUtils.java

Lines changed: 145 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22

33
import static com.github.jsonldjava.core.JSONLDConsts.*;
44
import static com.github.jsonldjava.core.JSONLDUtils.*;
5+
import static com.github.jsonldjava.core.Regex.HEX;
56

67
import java.text.DecimalFormat;
78
import java.util.ArrayList;
89
import java.util.Collections;
10+
import java.util.HashSet;
11+
import java.util.IllegalFormatConversionException;
912
import java.util.LinkedHashMap;
1013
import java.util.List;
1114
import java.util.Map;
@@ -219,7 +222,7 @@ static String toNQuad(RDFDataset.Quad triple, String graphName, String bnode) {
219222

220223
// subject is an IRI or bnode
221224
if (s.isIRI()) {
222-
quad += "<" + s.getValue() + ">";
225+
quad += "<" + escape(s.getValue()) + ">";
223226
}
224227
// normalization mode
225228
else if (bnode != null) {
@@ -231,11 +234,11 @@ else if (bnode != null) {
231234
}
232235

233236
// predicate is always an IRI
234-
quad += " <" + p.getValue() + "> ";
237+
quad += " <" + escape(p.getValue()) + "> ";
235238

236239
// object is IRI, bnode or literal
237240
if (o.isIRI()) {
238-
quad += "<" + o.getValue() + ">";
241+
quad += "<" + escape(o.getValue()) + ">";
239242
}
240243
else if (o.isBlankNode()) {
241244
// normalization mode
@@ -248,24 +251,19 @@ else if (o.isBlankNode()) {
248251
}
249252
}
250253
else {
251-
String escaped = o.getValue()
252-
.replaceAll("\\\\", "\\\\\\\\")
253-
.replaceAll("\\t", "\\\\t")
254-
.replaceAll("\\n", "\\\\n")
255-
.replaceAll("\\r", "\\\\r")
256-
.replaceAll("\\\"", "\\\\\"");
254+
String escaped = escape(o.getValue());
257255
quad += "\"" + escaped + "\"";
258256
if (RDF_LANGSTRING.equals(o.getDatatype())) {
259257
quad += "@" + o.getLanguage();
260258
} else if (!XSD_STRING.equals(o.getDatatype())) {
261-
quad += "^^<" + o.getDatatype() + ">";
259+
quad += "^^<" + escape(o.getDatatype()) + ">";
262260
}
263261
}
264262

265263
// graph
266264
if (graphName != null) {
267265
if (graphName.indexOf("_:") != 0) {
268-
quad += " <" + graphName + ">";
266+
quad += " <" + escape(graphName) + ">";
269267
}
270268
else if (bnode != null) {
271269
quad += " _:g";
@@ -282,14 +280,136 @@ else if (bnode != null) {
282280
static String toNQuad(RDFDataset.Quad triple, String graphName) {
283281
return toNQuad(triple, graphName, null);
284282
}
285-
286-
public static class Regex {
283+
284+
final private static Pattern UCHAR_MATCHED = Pattern.compile("\\u005C(?:([tbnrf\\\"'])|(?:u(" + HEX + "{4}))|(?:U(" + HEX + "{8})))");
285+
286+
public static String unescape(String str) {
287+
String rval = str;
288+
if (str != null) {
289+
Matcher m = UCHAR_MATCHED.matcher(str);
290+
while (m.find()) {
291+
String uni = m.group(0);
292+
if (m.group(1) == null) {
293+
String hex = m.group(2) != null ? m.group(2) : m.group(3);
294+
int v = Integer.parseInt(hex, 16);//hex = hex.replaceAll("^(?:00)+", "");
295+
if (v > 0xFFFF) {
296+
// deal with UTF-32
297+
//Integer v = Integer.parseInt(hex, 16);
298+
int vt = v - 0x10000;
299+
int vh = vt >> 10;
300+
int v1 = vt & 0x3FF;
301+
int w1 = 0xD800 + vh;
302+
int w2 = 0xDC00 + v1;
303+
304+
StringBuffer b = new StringBuffer();
305+
b.appendCodePoint(w1);
306+
b.appendCodePoint(w2);
307+
uni = b.toString();
308+
} else {
309+
uni = Character.toString((char)v);
310+
}
311+
} else {
312+
char c = m.group(1).charAt(0);
313+
switch (c) {
314+
case 'b':
315+
uni = "\b";
316+
break;
317+
case 'n':
318+
uni = "\n";
319+
break;
320+
case 't':
321+
uni = "\t";
322+
break;
323+
case 'f':
324+
uni = "\f";
325+
break;
326+
case 'r':
327+
uni = "\r";
328+
break;
329+
case '\'':
330+
uni = "'";
331+
break;
332+
case '\"':
333+
uni = "\"";
334+
break;
335+
case '\\':
336+
uni = "\\";
337+
break;
338+
default:
339+
// do nothing
340+
continue;
341+
}
342+
}
343+
String pat = Pattern.quote(m.group(0));
344+
String x = Integer.toHexString((int)uni.charAt(0));
345+
rval = rval.replaceAll(pat, uni);
346+
}
347+
}
348+
return rval;
349+
}
350+
351+
public static String escape(String str) {
352+
String rval = "";
353+
for (int i = 0 ; i < str.length() ; i++) {
354+
char hi = str.charAt(i);
355+
if (hi <= 0x8 || hi == 0xB || hi == 0xC || (hi >= 0xE && hi <= 0x1F) ||
356+
(hi >= 0x7F && hi <= 0xA0) || // 0xA0 is end of non-printable latin-1 supplement characters
357+
((hi >= 0x24F // 0x24F is the end of latin extensions
358+
&& !Character.isHighSurrogate(hi))
359+
// TODO: there's probably a lot of other characters that shouldn't be escaped that
360+
// fall outside these ranges, this is one example from the json-ld tests
361+
)) {
362+
rval += String.format("\\u%04x", (int)hi);
363+
}
364+
else if (Character.isHighSurrogate(hi)) {
365+
char lo = str.charAt(++i);
366+
int c = (hi << 10) + lo + (0x10000 - (0xD800 << 10) - 0xDC00);
367+
rval += String.format("\\U%08x", c);
368+
} else {
369+
switch (hi) {
370+
case '\b':
371+
rval += "\\b";
372+
break;
373+
case '\n':
374+
rval += "\\n";
375+
break;
376+
case '\t':
377+
rval += "\\t";
378+
break;
379+
case '\f':
380+
rval += "\\f";
381+
break;
382+
case '\r':
383+
rval += "\\r";
384+
break;
385+
//case '\'':
386+
// rval += "\\'";
387+
// break;
388+
case '\"':
389+
rval += "\\\"";
390+
//rval += "\\u0022";
391+
break;
392+
case '\\':
393+
rval += "\\\\";
394+
break;
395+
default:
396+
// just put the char as is
397+
rval += hi;
398+
break;
399+
}
400+
}
401+
}
402+
return rval;
403+
}
404+
405+
private static class Regex {
287406
// define partial regexes
288-
final public static Pattern IRI = Pattern.compile("(?:<([^:]+:[^>]*)>)");
407+
//final public static Pattern IRI = Pattern.compile("(?:<([^:]+:[^>]*)>)");
408+
final public static Pattern IRI = Pattern.compile("(?:<([^>]*)>)");
289409
final public static Pattern BNODE = Pattern.compile("(_:(?:[A-Za-z][A-Za-z0-9]*))");
290410
final public static Pattern PLAIN = Pattern.compile("\"([^\"\\\\]*(?:\\\\.[^\"\\\\]*)*)\"");
291411
final public static Pattern DATATYPE = Pattern.compile("(?:\\^\\^" + IRI + ")");
292-
final public static Pattern LANGUAGE = Pattern.compile("(?:@([a-z]+(?:-[a-z0-9]+)*))");
412+
final public static Pattern LANGUAGE = Pattern.compile("(?:@([a-z]+(?:-[a-zA-Z0-9]+)*))");
293413
final public static Pattern LITERAL = Pattern.compile("(?:" + PLAIN + "(?:" + DATATYPE + "|" + LANGUAGE + ")?)");
294414
final public static Pattern WS = Pattern.compile("[ \\t]+");
295415
final public static Pattern WSO = Pattern.compile("[ \\t]*");
@@ -304,17 +424,6 @@ public static class Regex {
304424

305425
// full quad regex
306426
final public static Pattern QUAD = Pattern.compile("^" + WSO + SUBJECT + PROPERTY + OBJECT + GRAPH + WSO + "$");
307-
308-
// turtle prefix line
309-
final public static Pattern TTL_PREFIX_NS = Pattern.compile("(?:([a-zA-Z0-9\\.]*):)"); // TODO: chars can be more
310-
final public static Pattern TTL_PREFIX_ID = Pattern.compile("^@prefix" + WS + TTL_PREFIX_NS + WS + IRI + WSO + "\\." + WSO + "$");
311-
312-
final public static Pattern IWSO = Pattern.compile("^" + WSO);
313-
final public static Pattern TTL_SUBJECT = Pattern.compile("^(?:" + TTL_PREFIX_NS + "([^ \\t]+)|" + BNODE + "|" + IRI + ")" + WS);
314-
final public static Pattern TTL_PREDICATE = Pattern.compile("^(?:" + TTL_PREFIX_NS + "([^ \\t]+)|" + IRI + ")" + WS);
315-
final public static Pattern TTL_DATATYPE = Pattern.compile("(?:\\^\\^" + TTL_PREFIX_NS + "([^ \\t]+)|" + IRI + ")");
316-
final public static Pattern TTL_LITERAL = Pattern.compile("(?:" + PLAIN + "(?:" + TTL_DATATYPE + "|" + LANGUAGE + ")?)");
317-
final public static Pattern TTL_OBJECT = Pattern.compile("^(?:" + TTL_PREFIX_NS + "([^,; \\t]+)([,;\\.]?)|" + IRI + "|" + BNODE + "|" + TTL_LITERAL + ")" + WSO);
318427
}
319428

320429
/**
@@ -350,38 +459,33 @@ public static RDFDataset parseNQuads(String input) throws JSONLDProcessingError
350459
// get subject
351460
RDFDataset.Node subject;
352461
if (match.group(1) != null) {
353-
subject = new RDFDataset.IRI(match.group(1));
462+
subject = new RDFDataset.IRI(unescape(match.group(1)));
354463
} else {
355-
subject = new RDFDataset.BlankNode(match.group(2));
464+
subject = new RDFDataset.BlankNode(unescape(match.group(2)));
356465
}
357466

358467
// get predicate
359-
RDFDataset.Node predicate = new RDFDataset.IRI(match.group(3));
468+
RDFDataset.Node predicate = new RDFDataset.IRI(unescape(match.group(3)));
360469

361470
// get object
362471
RDFDataset.Node object;
363472
if (match.group(4) != null) {
364-
object = new RDFDataset.IRI(match.group(4));
473+
object = new RDFDataset.IRI(unescape(match.group(4)));
365474
} else if (match.group(5) != null) {
366-
object = new RDFDataset.BlankNode(match.group(5));
475+
object = new RDFDataset.BlankNode(unescape(match.group(5)));
367476
} else {
368-
final String language = match.group(8);
369-
final String datatype = match.group(7) != null ? match.group(7) : match.group(8) != null ? RDF_LANGSTRING : XSD_STRING;
370-
final String unescaped = match.group(6)
371-
.replaceAll("\\\\\\\\", "\\\\")
372-
.replaceAll("\\\\t", "\\t")
373-
.replaceAll("\\\\n", "\\n")
374-
.replaceAll("\\\\r", "\\r")
375-
.replaceAll("\\\\\"", "\\\"");
477+
final String language = unescape(match.group(8));
478+
final String datatype = match.group(7) != null ? unescape(match.group(7)) : match.group(8) != null ? RDF_LANGSTRING : XSD_STRING;
479+
final String unescaped = unescape(match.group(6));
376480
object = new RDFDataset.Literal(unescaped, datatype, language);
377481
}
378482

379483
// get graph name ('@default' is used for the default graph)
380484
String name = "@default";
381485
if (match.group(9) != null) {
382-
name = match.group(9);
486+
name = unescape(match.group(9));
383487
} else if (match.group(10) != null) {
384-
name = match.group(10);
488+
name = unescape(match.group(10));
385489
}
386490

387491
RDFDataset.Quad triple = new RDFDataset.Quad(subject, predicate, object, name);
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package com.github.jsonldjava.core;
2+
3+
import java.util.regex.Pattern;
4+
5+
/**
 * Regular-expression building blocks for the terminals of the Turtle grammar.
 *
 * Most constants are fragments meant to be concatenated into larger
 * patterns (a {@link Pattern} concatenated into a string contributes its
 * source text). Fragments that contain a top-level alternation must be
 * wrapped in a group by whoever embeds them.
 */
public class Regex {
    // Supplementary code points U+10000 to U+EFFFF, written as surrogate pairs
    // inside a character class; this form seems to work with jdk1.6.
    public static final Pattern TRICKY_UTF_CHARS = Pattern.compile(
            "[\uD800\uDC00-\uDB7F\uDFFF]"
            );

    // for ttl
    public static final Pattern PN_CHARS_BASE = Pattern.compile("[a-zA-Z]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u02FF]|[\\u0370-\\u037D]|[\\u037F-\\u1FFF]|"
            + "[\\u200C-\\u200D]|[\\u2070-\\u218F]|[\\u2C00-\\u2FEF]|[\\u3001-\\uD7FF]|[\\uF900-\\uFDCF]|[\\uFDF0-\\uFFFD]|" + TRICKY_UTF_CHARS);
    public static final Pattern PN_CHARS_U = Pattern.compile(PN_CHARS_BASE + "|[_]");
    public static final Pattern PN_CHARS = Pattern.compile(PN_CHARS_U + "|[-0-9]|[\\u00B7]|[\\u0300-\\u036F]|[\\u203F-\\u2040]");
    public static final Pattern PN_PREFIX = Pattern.compile("(?:(?:" + PN_CHARS_BASE + ")(?:(?:" + PN_CHARS + "|[\\.])*(?:" + PN_CHARS + "))?)");
    public static final Pattern HEX = Pattern.compile("[0-9A-Fa-f]");
    public static final Pattern PN_LOCAL_ESC = Pattern.compile("[\\\\][_~\\.\\-!$&'\\(\\)*+,;=/?#@%]");
    public static final Pattern PERCENT = Pattern.compile("%" + HEX + HEX);
    public static final Pattern PLX = Pattern.compile(PERCENT + "|" + PN_LOCAL_ESC);
    public static final Pattern PN_LOCAL = Pattern.compile("((?:" + PN_CHARS_U + "|[:]|[0-9]|" + PLX + ")(?:(?:" + PN_CHARS + "|[.]|[:]|" + PLX + ")*(?:" + PN_CHARS + "|[:]|" + PLX + "))?)");
    public static final Pattern PNAME_NS = Pattern.compile("((?:" + PN_PREFIX + ")?):");
    public static final Pattern PNAME_LN = Pattern.compile("" + PNAME_NS + PN_LOCAL);
    public static final Pattern UCHAR = Pattern.compile("\\u005Cu" + HEX + HEX + HEX + HEX + "|\\u005CU" + HEX + HEX + HEX + HEX + HEX + HEX + HEX + HEX);
    public static final Pattern ECHAR = Pattern.compile("\\u005C[tbnrf\\u005C\"']");
    public static final Pattern IRIREF = Pattern.compile("(?:<((?:[^\\x00-\\x20<>\"{}|\\^`\\\\]|" + UCHAR + ")*)>)");
    public static final Pattern BLANK_NODE_LABEL = Pattern.compile("(?:_:((?:" + PN_CHARS_U + "|[0-9])(?:(?:" + PN_CHARS + "|[\\.])*(?:" + PN_CHARS + "))?))");
    public static final Pattern WS = Pattern.compile("[ \t\r\n]");
    public static final Pattern WS_0_N = Pattern.compile(WS + "*");
    public static final Pattern WS_0_1 = Pattern.compile(WS + "?");
    public static final Pattern WS_1_N = Pattern.compile(WS + "+");
    public static final Pattern STRING_LITERAL_QUOTE = Pattern.compile("\"(?:[^\\u0022\\u005C\\u000A\\u000D]|(?:" + ECHAR + ")|(?:" + UCHAR + "))*\"");
    public static final Pattern STRING_LITERAL_SINGLE_QUOTE = Pattern.compile("'(?:[^\\u0027\\u005C\\u000A\\u000D]|(?:" + ECHAR + ")|(?:" + UCHAR + "))*'");
    // Long strings: the optional one or two quotes may precede ANY of the three
    // alternatives (plain char, ECHAR, UCHAR), as in Turtle productions [24] and [25];
    // otherwise a quote directly followed by an escape inside the string is rejected.
    public static final Pattern STRING_LITERAL_LONG_SINGLE_QUOTE = Pattern.compile("'''(?:(?:'|'')?(?:[^'\\\\]|" + ECHAR + "|" + UCHAR + "))*'''");
    public static final Pattern STRING_LITERAL_LONG_QUOTE = Pattern.compile("\"\"\"(?:(?:\"|\"\")?(?:[^\"\\\\]|" + ECHAR + "|" + UCHAR + "))*\"\"\"");
    public static final Pattern LANGTAG = Pattern.compile("(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*))");
    public static final Pattern INTEGER = Pattern.compile("[+-]?[0-9]+");
    public static final Pattern DECIMAL = Pattern.compile("[+-]?[0-9]*\\.[0-9]+");
    public static final Pattern EXPONENT = Pattern.compile("[eE][+-]?[0-9]+");
    public static final Pattern DOUBLE = Pattern.compile("[+-]?(?:(?:[0-9]+\\.[0-9]*" + EXPONENT + ")|(?:\\.[0-9]+" + EXPONENT + ")|(?:[0-9]+" + EXPONENT + "))");
}

0 commit comments

Comments
 (0)