Skip to content

Commit e8470d1

Browse files
committed
Merge pull request #11 from Wikimedia-Sverige/bzip
Bzip
2 parents 43680ab + 0119c53 commit e8470d1

File tree

5 files changed

+97
-52
lines changed

5 files changed

+97
-52
lines changed

DCAT.php

Lines changed: 85 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ function validateConfig( array $config ) {
4242
}
4343
if ( $config['dumps-enabled'] ) {
4444
array_push( $top, "dump-info" );
45-
$sub["dump-info"] = array( "accessURL", "mediatype", "license" );
45+
$sub["dump-info"] = array(
46+
"accessURL", "mediatype", "compression", "license"
47+
);
4648
}
4749

4850
// Test
@@ -149,12 +151,12 @@ function makeDataBlob( $config ) {
149151
* @param XmlWriter $xml XML stream to write to
150152
* @param array $data data-blob of i18n and config variables
151153
* @param string|null $dumpDate the date of the dumpfile, null for live data
152-
* @param string $format the fileformat
154+
* @param string $dumpKey the key for the corresponding dump file
153155
*/
154-
function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate, $format ) {
156+
function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate, $dumpKey ) {
155157
$url = str_replace(
156158
'$1',
157-
$dumpDate . '/' . $data['dumps'][$dumpDate][$format]['filename'],
159+
$dumpDate . '/' . $data['dumps'][$dumpDate][$dumpKey]['filename'],
158160
$data['config']['dump-info']['accessURL']
159161
);
160162

@@ -169,16 +171,52 @@ function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate, $format
169171
$xml->startElementNS( 'dcterms', 'issued', null );
170172
$xml->writeAttributeNS( 'rdf', 'datatype', null,
171173
'http://www.w3.org/2001/XMLSchema#date' );
172-
$xml->text( $data['dumps'][$dumpDate][$format]['timestamp'] );
174+
$xml->text( $data['dumps'][$dumpDate][$dumpKey]['timestamp'] );
173175
$xml->endElement();
174176

175177
$xml->startElementNS( 'dcat', 'byteSize', null );
176178
$xml->writeAttributeNS( 'rdf', 'datatype', null,
177179
'http://www.w3.org/2001/XMLSchema#decimal' );
178-
$xml->text( $data['dumps'][$dumpDate][$format]['byteSize'] );
180+
$xml->text( $data['dumps'][$dumpDate][$dumpKey]['byteSize'] );
179181
$xml->endElement();
180182
}
181183

184+
/**
185+
* Add i18n descriptions for a distribution
186+
*
187+
* @param XmlWriter $xml XML stream to write to
188+
* @param array $data data-blob of i18n and config variables
189+
* @param bool $isDump whether this is a dump distribution
190+
* @param string $prefix the type of distribution, one of ld, api or dump
191+
* @param string $format the file format, if dump
192+
* @param string $compression the compression format, if dump
193+
*/
194+
function writeDistributionI18n( XMLWriter $xml, array $data, $isDump,
195+
$prefix, $format, $compression ) {
196+
197+
foreach ( $data['i18n'] as $langCode => $langData ) {
198+
if ( array_key_exists( "distribution-$prefix-description", $langData ) ) {
199+
$formatDescription = $langData["distribution-$prefix-description"];
200+
if ( $isDump ) {
201+
$formatDescription = str_replace(
202+
'$1',
203+
$format,
204+
$formatDescription
205+
);
206+
$formatDescription = str_replace(
207+
'$2',
208+
$compression,
209+
$formatDescription
210+
);
211+
}
212+
$xml->startElementNS( 'dcterms', 'description', null );
213+
$xml->writeAttributeNS( 'xml', 'lang', null, $langCode );
214+
$xml->text( $formatDescription );
215+
$xml->endElement();
216+
}
217+
}
218+
}
219+
182220
/**
183221
* Construct distribution entry for each format in which a distribution
184222
* is available. The DCAT-specification requires each format to be a
@@ -193,56 +231,55 @@ function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate, $format
193231
function writeDistribution( XMLWriter $xml, array $data, $distribId, $prefix, $dumpDate ) {
194232
$ids = array();
195233

234+
$isDump = !is_null( $dumpDate );
196235
$allowedMediatypes = $data['config']["$prefix-info"]['mediatype'];
197-
foreach ( $allowedMediatypes as $format => $mediatype ) {
198-
// handle missing (and BETA) dump files
199-
if ( !is_null( $dumpDate ) and !array_key_exists( $format, $data['dumps'][$dumpDate] ) ) {
200-
continue;
201-
}
236+
$allowedCompressiontypes = array( '' => '' ); // dummy array for non-dumps
237+
if ( $isDump ) {
238+
$allowedCompressiontypes = $data['config']["$prefix-info"]['compression'];
239+
}
202240

203-
$id = $data['config']['uri'] . '#' . $distribId . $dumpDate . $format;
204-
array_push( $ids, $id );
241+
foreach ( $allowedCompressiontypes as $compressionName => $compression ) {
242+
foreach ( $allowedMediatypes as $format => $mediatype ) {
243+
$distributionKey = $format . $compression;
205244

206-
$xml->startElementNS( 'rdf', 'Description', null );
207-
$xml->writeAttributeNS( 'rdf', 'about', null, $id );
245+
// handle missing (and BETA) dump files
246+
if ( $isDump and !array_key_exists( $distributionKey , $data['dumps'][$dumpDate] ) ) {
247+
continue;
248+
}
208249

209-
$xml->startElementNS( 'rdf', 'type', null );
210-
$xml->writeAttributeNS( 'rdf', 'resource', null,
211-
'http://www.w3.org/ns/dcat#Distribution' );
212-
$xml->endElement();
250+
$id = $data['config']['uri'] . '#' . $distribId . $dumpDate . $distributionKey;
251+
array_push( $ids, $id );
213252

214-
$xml->startElementNS( 'dcterms', 'license', null );
215-
$xml->writeAttributeNS( 'rdf', 'resource', null,
216-
$data['config']["$prefix-info"]['license'] );
217-
$xml->endElement();
253+
$xml->startElementNS( 'rdf', 'Description', null );
254+
$xml->writeAttributeNS( 'rdf', 'about', null, $id );
218255

219-
if ( is_null( $dumpDate ) ) {
220-
$xml->startElementNS( 'dcat', 'accessURL', null );
256+
$xml->startElementNS( 'rdf', 'type', null );
221257
$xml->writeAttributeNS( 'rdf', 'resource', null,
222-
$data['config']["$prefix-info"]['accessURL'] );
258+
'http://www.w3.org/ns/dcat#Distribution' );
223259
$xml->endElement();
224-
} else {
225-
dumpDistributionExtras( $xml, $data, $dumpDate, $format );
226-
}
227260

228-
$xml->writeElementNS( 'dcterms', 'format', null, $mediatype );
261+
$xml->startElementNS( 'dcterms', 'license', null );
262+
$xml->writeAttributeNS( 'rdf', 'resource', null,
263+
$data['config']["$prefix-info"]['license'] );
264+
$xml->endElement();
229265

230-
// add description in each language
231-
foreach ( $data['i18n'] as $langCode => $langData ) {
232-
if ( array_key_exists( "distribution-$prefix-description", $langData ) ) {
233-
$formatDescription = str_replace(
234-
'$1',
235-
$format,
236-
$langData["distribution-$prefix-description"]
237-
);
238-
$xml->startElementNS( 'dcterms', 'description', null );
239-
$xml->writeAttributeNS( 'xml', 'lang', null, $langCode );
240-
$xml->text( $formatDescription );
266+
if ( !$isDump ) {
267+
$xml->startElementNS( 'dcat', 'accessURL', null );
268+
$xml->writeAttributeNS( 'rdf', 'resource', null,
269+
$data['config']["$prefix-info"]['accessURL'] );
241270
$xml->endElement();
271+
} else {
272+
dumpDistributionExtras( $xml, $data, $dumpDate, $distributionKey );
242273
}
243-
}
244274

245-
$xml->endElement();
275+
$xml->writeElementNS( 'dcterms', 'format', null, $mediatype );
276+
277+
// add description in each language
278+
writeDistributionI18n( $xml, $data, $isDump, $prefix,
279+
$format, $compressionName );
280+
281+
$xml->endElement();
282+
}
246283
}
247284

248285
return $ids;
@@ -584,8 +621,10 @@ function outputXml( array $data ) {
584621
*/
585622
function scanDump( $dirname, array $data ) {
586623
$testStrings = array();
587-
foreach ( $data['config']['dump-info']['mediatype'] as $fileEnding => $mediatype ) {
588-
$testStrings[$fileEnding] = 'all.' . $fileEnding . '.gz';
624+
foreach ( $data['config']['dump-info']['compression'] as $compression ) {
625+
foreach ( $data['config']['dump-info']['mediatype'] as $format => $mediatype ) {
626+
$testStrings["$format$compression"] = '-all.' . $format . '.' . $compression;
627+
}
589628
}
590629

591630
$dumps = array();
@@ -594,7 +633,7 @@ function scanDump( $dirname, array $data ) {
594633
foreach ( glob( $dirname . '/[0-9]*', GLOB_ONLYDIR ) as $subdir ) {
595634
// $subdir = testdirNew/20150120
596635
$subDump = array();
597-
foreach ( glob( $subdir . '/*.gz' ) as $filename ) {
636+
foreach ( glob( $subdir . '/*' ) as $filename ) {
598637
// match each file against an expected testString
599638
foreach ( $testStrings as $fileEnding => $testString ) {
600639
if ( substr( $filename, -strlen( $testString ) ) === $testString ) {

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Takes into account access through:
99

1010
* Content negotiation (various formats)
1111
* MediaWiki api (various formats)
12-
* Entity dumps e.g. json, ttl (assumes that these are gziped)
12+
* Entity dumps e.g. json, ttl (assumes that these are compressed)
1313

1414
An example result can be found at [lokal-profil / dcatap.rdf](https://gist.github.com/lokal-profil/8086dc6bf2398d84a311).
1515
The live DCAT-AP description of Wikidata can be found [here](https://dumps.wikimedia.org/wikidatawiki/entities/dcatap.rdf).
@@ -102,6 +102,8 @@ Below follows a key by key explanation of the config file.
102102
* `accessURL`: URL to the directory where the *.json.gz* files
103103
reside (`$1` is replaced on the fly by the actual filename),
104104
e.g. *http://example.org/dumps/$1*
105-
* `mediatype`: (`object`) List of media types. In practice this is
106-
always `{"json": "application/json"}` ... for now
105+
* `mediatype`: (`object`) List of media types. e.g.
106+
`{"json": "application/json"}`
107+
* `compression`: (`object`) List of compression formats, in the
108+
format *name:file-ending* e.g. `{"gzip": "gz"}`
107109
* `license`: See ld-info:license above

config.example.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@
4545
"json": "application/json",
4646
"ttl": "text/turtle"
4747
},
48+
"compression": {
49+
"gzip": "gz",
50+
"bzip2": "bz2"
51+
},
4852
"license": "http://creativecommons.org/publicdomain/zero/1.0/"
4953
}
5054
}

i18n/en.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
"dataset-live-title": "Live access",
66
"dataset-live-description": "The live version of the data, includes entities and properties. Only non-deprecated formats are listed as distributions.",
77
"dataset-dump-title": "Entity dump of $1",
8-
"dataset-dump-description": "A static dump of all entites for the given date.",
8+
"dataset-dump-description": "A static dump of all entities for the given date.",
99
"distribution-ld-description": "The Linked Data endpoint. Format is resolved through content negotiation.",
1010
"distribution-api-description": "The MediaWiki API endpoint. Format is given through the \"format\" parameter.",
11-
"distribution-dump-description": "A gziped $1 file."
11+
"distribution-dump-description": "A $1 file, $2 compressed."
1212
}

i18n/qqq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@
1010
"dataset-dump-description": "The description of the entity dump for the given date.",
1111
"distribution-ld-description": "The description of the Linked Data endpoint. For content negotiation see https://en.wikipedia.org/wiki/Content_negotiation",
1212
"distribution-api-description": "The description of the MediaWiki API endpoint. Leave \"format\" untranslated.",
13-
"distribution-dump-description": "The description of a dump file where $1 is the file format."
13+
"distribution-dump-description": "The description of a dump file where $1 is the file format and $2 the compression format."
1414
}

0 commit comments

Comments (0)