22
33import static com .github .jsonldjava .core .JSONLDConsts .*;
44import static com .github .jsonldjava .core .JSONLDUtils .*;
5+ import static com .github .jsonldjava .core .Regex .HEX ;
56
67import java .text .DecimalFormat ;
78import java .util .ArrayList ;
89import java .util .Collections ;
10+ import java .util .HashSet ;
11+ import java .util .IllegalFormatConversionException ;
912import java .util .LinkedHashMap ;
1013import java .util .List ;
1114import java .util .Map ;
@@ -219,7 +222,7 @@ static String toNQuad(RDFDataset.Quad triple, String graphName, String bnode) {
219222
220223 // subject is an IRI or bnode
221224 if (s .isIRI ()) {
222- quad += "<" + s .getValue () + ">" ;
225+ quad += "<" + escape ( s .getValue () ) + ">" ;
223226 }
224227 // normalization mode
225228 else if (bnode != null ) {
@@ -231,11 +234,11 @@ else if (bnode != null) {
231234 }
232235
233236 // predicate is always an IRI
234- quad += " <" + p .getValue () + "> " ;
237+ quad += " <" + escape ( p .getValue () ) + "> " ;
235238
236239 // object is IRI, bnode or literal
237240 if (o .isIRI ()) {
238- quad += "<" + o .getValue () + ">" ;
241+ quad += "<" + escape ( o .getValue () ) + ">" ;
239242 }
240243 else if (o .isBlankNode ()) {
241244 // normalization mode
@@ -248,24 +251,19 @@ else if (o.isBlankNode()) {
248251 }
249252 }
250253 else {
251- String escaped = o .getValue ()
252- .replaceAll ("\\ \\ " , "\\ \\ \\ \\ " )
253- .replaceAll ("\\ t" , "\\ \\ t" )
254- .replaceAll ("\\ n" , "\\ \\ n" )
255- .replaceAll ("\\ r" , "\\ \\ r" )
256- .replaceAll ("\\ \" " , "\\ \\ \" " );
254+ String escaped = escape (o .getValue ());
257255 quad += "\" " + escaped + "\" " ;
258256 if (RDF_LANGSTRING .equals (o .getDatatype ())) {
259257 quad += "@" + o .getLanguage ();
260258 } else if (!XSD_STRING .equals (o .getDatatype ())) {
261- quad += "^^<" + o .getDatatype () + ">" ;
259+ quad += "^^<" + escape ( o .getDatatype () ) + ">" ;
262260 }
263261 }
264262
265263 // graph
266264 if (graphName != null ) {
267265 if (graphName .indexOf ("_:" ) != 0 ) {
268- quad += " <" + graphName + ">" ;
266+ quad += " <" + escape ( graphName ) + ">" ;
269267 }
270268 else if (bnode != null ) {
271269 quad += " _:g" ;
@@ -282,14 +280,136 @@ else if (bnode != null) {
282280 static String toNQuad (RDFDataset .Quad triple , String graphName ) {
283281 return toNQuad (triple , graphName , null );
284282 }
285-
286- public static class Regex {
283+
// Matches one escape sequence: a backslash followed by either a single
// character from the bracketed class (capture group 1), a lower-case 'u' and
// four HEX digits (group 2), or an upper-case 'U' and eight HEX digits
// (group 3). HEX is the statically imported pattern from Regex.
// NOTE(review): group 1's class appears to list t, b, n, r, f and the two quote
// characters but not the backslash itself - confirm against the escaper.
284+ final private static Pattern UCHAR_MATCHED = Pattern .compile ("\\ u005C(?:([tbnrf\\ \" '])|(?:u(" + HEX + "{4}))|(?:U(" + HEX + "{8})))" );
285+
286+ public static String unescape (String str ) {
287+ String rval = str ;
288+ if (str != null ) {
289+ Matcher m = UCHAR_MATCHED .matcher (str );
290+ while (m .find ()) {
291+ String uni = m .group (0 );
292+ if (m .group (1 ) == null ) {
293+ String hex = m .group (2 ) != null ? m .group (2 ) : m .group (3 );
294+ int v = Integer .parseInt (hex , 16 );//hex = hex.replaceAll("^(?:00)+", "");
295+ if (v > 0xFFFF ) {
296+ // deal with UTF-32
297+ //Integer v = Integer.parseInt(hex, 16);
298+ int vt = v - 0x10000 ;
299+ int vh = vt >> 10 ;
300+ int v1 = vt & 0x3FF ;
301+ int w1 = 0xD800 + vh ;
302+ int w2 = 0xDC00 + v1 ;
303+
304+ StringBuffer b = new StringBuffer ();
305+ b .appendCodePoint (w1 );
306+ b .appendCodePoint (w2 );
307+ uni = b .toString ();
308+ } else {
309+ uni = Character .toString ((char )v );
310+ }
311+ } else {
312+ char c = m .group (1 ).charAt (0 );
313+ switch (c ) {
314+ case 'b' :
315+ uni = "\b " ;
316+ break ;
317+ case 'n' :
318+ uni = "\n " ;
319+ break ;
320+ case 't' :
321+ uni = "\t " ;
322+ break ;
323+ case 'f' :
324+ uni = "\f " ;
325+ break ;
326+ case 'r' :
327+ uni = "\r " ;
328+ break ;
329+ case '\'' :
330+ uni = "'" ;
331+ break ;
332+ case '\"' :
333+ uni = "\" " ;
334+ break ;
335+ case '\\' :
336+ uni = "\\ " ;
337+ break ;
338+ default :
339+ // do nothing
340+ continue ;
341+ }
342+ }
343+ String pat = Pattern .quote (m .group (0 ));
344+ String x = Integer .toHexString ((int )uni .charAt (0 ));
345+ rval = rval .replaceAll (pat , uni );
346+ }
347+ }
348+ return rval ;
349+ }
350+
351+ public static String escape (String str ) {
352+ String rval = "" ;
353+ for (int i = 0 ; i < str .length () ; i ++) {
354+ char hi = str .charAt (i );
355+ if (hi <= 0x8 || hi == 0xB || hi == 0xC || (hi >= 0xE && hi <= 0x1F ) ||
356+ (hi >= 0x7F && hi <= 0xA0 ) || // 0xA0 is end of non-printable latin-1 supplement characters
357+ ((hi >= 0x24F // 0x24F is the end of latin extensions
358+ && !Character .isHighSurrogate (hi ))
359+ // TODO: there's probably a lot of other characters that shouldn't be escaped that
360+ // fall outside these ranges, this is one example from the json-ld tests
361+ )) {
362+ rval += String .format ("\\ u%04x" , (int )hi );
363+ }
364+ else if (Character .isHighSurrogate (hi )) {
365+ char lo = str .charAt (++i );
366+ int c = (hi << 10 ) + lo + (0x10000 - (0xD800 << 10 ) - 0xDC00 );
367+ rval += String .format ("\\ U%08x" , c );
368+ } else {
369+ switch (hi ) {
370+ case '\b' :
371+ rval += "\\ b" ;
372+ break ;
373+ case '\n' :
374+ rval += "\\ n" ;
375+ break ;
376+ case '\t' :
377+ rval += "\\ t" ;
378+ break ;
379+ case '\f' :
380+ rval += "\\ f" ;
381+ break ;
382+ case '\r' :
383+ rval += "\\ r" ;
384+ break ;
385+ //case '\'':
386+ // rval += "\\'";
387+ // break;
388+ case '\"' :
389+ rval += "\\ \" " ;
390+ //rval += "\\u0022";
391+ break ;
392+ case '\\' :
393+ rval += "\\ \\ " ;
394+ break ;
395+ default :
396+ // just put the char as is
397+ rval += hi ;
398+ break ;
399+ }
400+ }
401+ }
402+ return rval ;
403+ }
404+
405+ private static class Regex {
287406 // define partial regexes
288- final public static Pattern IRI = Pattern .compile ("(?:<([^:]+:[^>]*)>)" );
407+ //final public static Pattern IRI = Pattern.compile("(?:<([^:]+:[^>]*)>)");
408+ final public static Pattern IRI = Pattern .compile ("(?:<([^>]*)>)" );
289409 final public static Pattern BNODE = Pattern .compile ("(_:(?:[A-Za-z][A-Za-z0-9]*))" );
290410 final public static Pattern PLAIN = Pattern .compile ("\" ([^\" \\ \\ ]*(?:\\ \\ .[^\" \\ \\ ]*)*)\" " );
291411 final public static Pattern DATATYPE = Pattern .compile ("(?:\\ ^\\ ^" + IRI + ")" );
292- final public static Pattern LANGUAGE = Pattern .compile ("(?:@([a-z]+(?:-[a-z0 -9]+)*))" );
412+ final public static Pattern LANGUAGE = Pattern .compile ("(?:@([a-z]+(?:-[a-zA-Z0 -9]+)*))" );
293413 final public static Pattern LITERAL = Pattern .compile ("(?:" + PLAIN + "(?:" + DATATYPE + "|" + LANGUAGE + ")?)" );
294414 final public static Pattern WS = Pattern .compile ("[ \\ t]+" );
295415 final public static Pattern WSO = Pattern .compile ("[ \\ t]*" );
@@ -304,17 +424,6 @@ public static class Regex {
304424
305425 // full quad regex
306426 final public static Pattern QUAD = Pattern .compile ("^" + WSO + SUBJECT + PROPERTY + OBJECT + GRAPH + WSO + "$" );
307-
308- // turtle prefix line
309- final public static Pattern TTL_PREFIX_NS = Pattern .compile ("(?:([a-zA-Z0-9\\ .]*):)" ); // TODO: chars can be more
310- final public static Pattern TTL_PREFIX_ID = Pattern .compile ("^@prefix" + WS + TTL_PREFIX_NS + WS + IRI + WSO + "\\ ." + WSO + "$" );
311-
312- final public static Pattern IWSO = Pattern .compile ("^" + WSO );
313- final public static Pattern TTL_SUBJECT = Pattern .compile ("^(?:" + TTL_PREFIX_NS + "([^ \\ t]+)|" + BNODE + "|" + IRI + ")" + WS );
314- final public static Pattern TTL_PREDICATE = Pattern .compile ("^(?:" + TTL_PREFIX_NS + "([^ \\ t]+)|" + IRI + ")" + WS );
315- final public static Pattern TTL_DATATYPE = Pattern .compile ("(?:\\ ^\\ ^" + TTL_PREFIX_NS + "([^ \\ t]+)|" + IRI + ")" );
316- final public static Pattern TTL_LITERAL = Pattern .compile ("(?:" + PLAIN + "(?:" + TTL_DATATYPE + "|" + LANGUAGE + ")?)" );
317- final public static Pattern TTL_OBJECT = Pattern .compile ("^(?:" + TTL_PREFIX_NS + "([^,; \\ t]+)([,;\\ .]?)|" + IRI + "|" + BNODE + "|" + TTL_LITERAL + ")" + WSO );
318427 }
319428
320429 /**
@@ -350,38 +459,33 @@ public static RDFDataset parseNQuads(String input) throws JSONLDProcessingError
350459 // get subject
351460 RDFDataset .Node subject ;
352461 if (match .group (1 ) != null ) {
353- subject = new RDFDataset .IRI (match .group (1 ));
462+ subject = new RDFDataset .IRI (unescape ( match .group (1 ) ));
354463 } else {
355- subject = new RDFDataset .BlankNode (match .group (2 ));
464+ subject = new RDFDataset .BlankNode (unescape ( match .group (2 ) ));
356465 }
357466
358467 // get predicate
359- RDFDataset .Node predicate = new RDFDataset .IRI (match .group (3 ));
468+ RDFDataset .Node predicate = new RDFDataset .IRI (unescape ( match .group (3 ) ));
360469
361470 // get object
362471 RDFDataset .Node object ;
363472 if (match .group (4 ) != null ) {
364- object = new RDFDataset .IRI (match .group (4 ));
473+ object = new RDFDataset .IRI (unescape ( match .group (4 ) ));
365474 } else if (match .group (5 ) != null ) {
366- object = new RDFDataset .BlankNode (match .group (5 ));
475+ object = new RDFDataset .BlankNode (unescape ( match .group (5 ) ));
367476 } else {
368- final String language = match .group (8 );
369- final String datatype = match .group (7 ) != null ? match .group (7 ) : match .group (8 ) != null ? RDF_LANGSTRING : XSD_STRING ;
370- final String unescaped = match .group (6 )
371- .replaceAll ("\\ \\ \\ \\ " , "\\ \\ " )
372- .replaceAll ("\\ \\ t" , "\\ t" )
373- .replaceAll ("\\ \\ n" , "\\ n" )
374- .replaceAll ("\\ \\ r" , "\\ r" )
375- .replaceAll ("\\ \\ \" " , "\\ \" " );
477+ final String language = unescape (match .group (8 ));
478+ final String datatype = match .group (7 ) != null ? unescape (match .group (7 )) : match .group (8 ) != null ? RDF_LANGSTRING : XSD_STRING ;
479+ final String unescaped = unescape (match .group (6 ));
376480 object = new RDFDataset .Literal (unescaped , datatype , language );
377481 }
378482
379483 // get graph name ('@default' is used for the default graph)
380484 String name = "@default" ;
381485 if (match .group (9 ) != null ) {
382- name = match .group (9 );
486+ name = unescape ( match .group (9 ) );
383487 } else if (match .group (10 ) != null ) {
384- name = match .group (10 );
488+ name = unescape ( match .group (10 ) );
385489 }
386490
387491 RDFDataset .Quad triple = new RDFDataset .Quad (subject , predicate , object , name );
0 commit comments