Skip to content

Commit ee4f2c1

Browse files
committed
Allow dumpfiles in any compression
Turn the compression format into a config variable thereby allowing both gzip and bzip dump files.
1 parent 43680ab commit ee4f2c1

File tree

5 files changed

+70
-47
lines changed

5 files changed

+70
-47
lines changed

DCAT.php

Lines changed: 62 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ function validateConfig( array $config ) {
4242
}
4343
if ( $config['dumps-enabled'] ) {
4444
array_push( $top, "dump-info" );
45-
$sub["dump-info"] = array( "accessURL", "mediatype", "license" );
45+
$sub["dump-info"] = array( "accessURL", "mediatype", "compression", "license" );
4646
}
4747

4848
// Test
@@ -194,55 +194,70 @@ function writeDistribution( XMLWriter $xml, array $data, $distribId, $prefix, $d
194194
$ids = array();
195195

196196
$allowedMediatypes = $data['config']["$prefix-info"]['mediatype'];
197-
foreach ( $allowedMediatypes as $format => $mediatype ) {
198-
// handle missing (and BETA) dump files
199-
if ( !is_null( $dumpDate ) and !array_key_exists( $format, $data['dumps'][$dumpDate] ) ) {
200-
continue;
201-
}
197+
$allowedCompressiontypes = array( '' => '' ); // dummy array for non-dumps
198+
if ( !is_null( $dumpDate ) ){
199+
$allowedCompressiontypes = $data['config']["$prefix-info"]['compression'];
200+
}
202201

203-
$id = $data['config']['uri'] . '#' . $distribId . $dumpDate . $format;
204-
array_push( $ids, $id );
202+
foreach ( $allowedCompressiontypes as $compression => $compressiontype ) {
203+
foreach ( $allowedMediatypes as $format => $mediatype ) {
204+
// handle missing (and BETA) dump files
205+
if ( !is_null( $dumpDate ) and !array_key_exists( $format . $compression , $data['dumps'][$dumpDate] ) ) {
206+
continue;
207+
}
205208

206-
$xml->startElementNS( 'rdf', 'Description', null );
207-
$xml->writeAttributeNS( 'rdf', 'about', null, $id );
209+
$id = $data['config']['uri'] . '#' . $distribId . $dumpDate . $format . $compression;
210+
array_push( $ids, $id );
208211

209-
$xml->startElementNS( 'rdf', 'type', null );
210-
$xml->writeAttributeNS( 'rdf', 'resource', null,
211-
'http://www.w3.org/ns/dcat#Distribution' );
212-
$xml->endElement();
212+
$xml->startElementNS( 'rdf', 'Description', null );
213+
$xml->writeAttributeNS( 'rdf', 'about', null, $id );
213214

214-
$xml->startElementNS( 'dcterms', 'license', null );
215-
$xml->writeAttributeNS( 'rdf', 'resource', null,
216-
$data['config']["$prefix-info"]['license'] );
217-
$xml->endElement();
218-
219-
if ( is_null( $dumpDate ) ) {
220-
$xml->startElementNS( 'dcat', 'accessURL', null );
215+
$xml->startElementNS( 'rdf', 'type', null );
221216
$xml->writeAttributeNS( 'rdf', 'resource', null,
222-
$data['config']["$prefix-info"]['accessURL'] );
217+
'http://www.w3.org/ns/dcat#Distribution' );
223218
$xml->endElement();
224-
} else {
225-
dumpDistributionExtras( $xml, $data, $dumpDate, $format );
226-
}
227219

228-
$xml->writeElementNS( 'dcterms', 'format', null, $mediatype );
220+
$xml->startElementNS( 'dcterms', 'license', null );
221+
$xml->writeAttributeNS( 'rdf', 'resource', null,
222+
$data['config']["$prefix-info"]['license'] );
223+
$xml->endElement();
229224

230-
// add description in each language
231-
foreach ( $data['i18n'] as $langCode => $langData ) {
232-
if ( array_key_exists( "distribution-$prefix-description", $langData ) ) {
233-
$formatDescription = str_replace(
234-
'$1',
235-
$format,
236-
$langData["distribution-$prefix-description"]
237-
);
238-
$xml->startElementNS( 'dcterms', 'description', null );
239-
$xml->writeAttributeNS( 'xml', 'lang', null, $langCode );
240-
$xml->text( $formatDescription );
225+
if ( is_null( $dumpDate ) ) {
226+
$xml->startElementNS( 'dcat', 'accessURL', null );
227+
$xml->writeAttributeNS( 'rdf', 'resource', null,
228+
$data['config']["$prefix-info"]['accessURL'] );
241229
$xml->endElement();
230+
} else {
231+
dumpDistributionExtras( $xml, $data, $dumpDate, $format . $compression );
242232
}
243-
}
244233

245-
$xml->endElement();
234+
$xml->writeElementNS( 'dcterms', 'format', null, $mediatype );
235+
236+
// add description in each language
237+
foreach ( $data['i18n'] as $langCode => $langData ) {
238+
if ( array_key_exists( "distribution-$prefix-description", $langData ) ) {
239+
$formatDescription = $langData["distribution-$prefix-description"];
240+
if ( !is_null( $dumpDate ) ) {
241+
$formatDescription = str_replace(
242+
'$1',
243+
$format,
244+
$formatDescription
245+
);
246+
$formatDescription = str_replace(
247+
'$2',
248+
$compressiontype,
249+
$formatDescription
250+
);
251+
}
252+
$xml->startElementNS( 'dcterms', 'description', null );
253+
$xml->writeAttributeNS( 'xml', 'lang', null, $langCode );
254+
$xml->text( $formatDescription );
255+
$xml->endElement();
256+
}
257+
}
258+
259+
$xml->endElement();
260+
}
246261
}
247262

248263
return $ids;
@@ -584,8 +599,10 @@ function outputXml( array $data ) {
584599
*/
585600
function scanDump( $dirname, array $data ) {
586601
$testStrings = array();
587-
foreach ( $data['config']['dump-info']['mediatype'] as $fileEnding => $mediatype ) {
588-
$testStrings[$fileEnding] = 'all.' . $fileEnding . '.gz';
602+
foreach ( $data['config']['dump-info']['compression'] as $compression => $compressiontype ) {
603+
foreach ( $data['config']['dump-info']['mediatype'] as $format => $mediatype ) {
604+
$testStrings["$format$compression"] = '-all.' . $format . '.' . $compression;
605+
}
589606
}
590607

591608
$dumps = array();
@@ -594,16 +611,18 @@ function scanDump( $dirname, array $data ) {
594611
foreach ( glob( $dirname . '/[0-9]*', GLOB_ONLYDIR ) as $subdir ) {
595612
// $subdir = testdirNew/20150120
596613
$subDump = array();
597-
foreach ( glob( $subdir . '/*.gz' ) as $filename ) {
614+
foreach ( glob( $subdir . '/*' ) as $filename ) {
598615
// match each file against an expected testString
616+
#@todo change into one loop
599617
foreach ( $testStrings as $fileEnding => $testString ) {
600618
if ( substr( $filename, -strlen( $testString ) ) === $testString ) {
601619
$info = stat( $filename );
602620
$filename = substr( $filename, strlen( $subdir . '/' ) );
603621
$subDump[$fileEnding] = array(
604622
'timestamp' => gmdate( 'Y-m-d', $info['mtime'] ),
605623
'byteSize' => $info['size'],
606-
'filename' => $filename
624+
'filename' => $filename,
625+
'compression' => $compression
607626
);
608627
}
609628
}

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Takes into account access through:
99

1010
* Content negotiation (various formats)
1111
* MediaWiki api (various formats)
12-
* Entity dumps e.g. json, ttl (assumes that these are gziped)
12+
* Entity dumps e.g. json, ttl (assumes that these are compressed)
1313

1414
An example result can be found at [lokal-profil / dcatap.rdf](https://gist.github.com/lokal-profil/8086dc6bf2398d84a311).
1515
The live DCAT-AP description of Wikidata can be found [here](https://dumps.wikimedia.org/wikidatawiki/entities/dcatap.rdf).

config.example.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@
4545
"json": "application/json",
4646
"ttl": "text/turtle"
4747
},
48+
"compression": {
49+
"gz": "gzip",
50+
"bz": "bzip"
51+
},
4852
"license": "http://creativecommons.org/publicdomain/zero/1.0/"
4953
}
5054
}

i18n/en.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
"dataset-live-title": "Live access",
66
"dataset-live-description": "The live version of the data, includes entities and properties. Only non-deprecated formats are listed as distributions.",
77
"dataset-dump-title": "Entity dump of $1",
8-
"dataset-dump-description": "A static dump of all entites for the given date.",
8+
"dataset-dump-description": "A static dump of all entities for the given date.",
99
"distribution-ld-description": "The Linked Data endpoint. Format is resolved through content negotiation.",
1010
"distribution-api-description": "The MediaWiki API endpoint. Format is given through the \"format\" parameter.",
11-
"distribution-dump-description": "A gziped $1 file."
11+
"distribution-dump-description": "A $1 file, $2 compressed."
1212
}

i18n/qqq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@
1010
"dataset-dump-description": "The description of the entity dump for the given date.",
1111
"distribution-ld-description": "The description of the Linked Data endpoint. For content negotiation see https://en.wikipedia.org/wiki/Content_negotiation",
1212
"distribution-api-description": "The description of the MediaWiki API endpoint. Leave \"format\" untranslated.",
13-
"distribution-dump-description": "The description of a dump file where $1 is the file format."
13+
"distribution-dump-description": "The description of a dump file where $1 is the file format and $2 the compression format."
1414
}

0 commit comments

Comments
 (0)