Skip to content

Commit e58a10e

Browse files
committed
Major refactor & all-wiki dump functionality
- Ability to dump all wikis stored in restbase: bin/dump_restbase - Moved HTML dump script to bin/dump_wiki - Moved the implementation modules to lib/ - Changed sqlite schema slightly: - renamed uuid column to 'tid' - changed body type to TEXT
1 parent 30a0be8 commit e58a10e

File tree

9 files changed

+203
-242
lines changed

9 files changed

+203
-242
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ Parsoid HTML dump script for RESTBase APIs like https://rest.wikimedia.org/.
88
## Usage
99

1010
```
11-
Usage: node ./htmldumper
12-
Example: node htmldumper.js --domain en.wikipedia.org \
11+
Usage: node ./bin/dump_wiki
12+
Example: node ./bin/dump_wiki --domain en.wikipedia.org \
1313
--ns 0 --apiURL http://en.wikipedia.org/w/api.php \
1414
--saveDir /tmp
1515
@@ -51,8 +51,8 @@ updated. The schema currently looks like this:
5151
CREATE TABLE data(
5252
title TEXT,
5353
revision INTEGER,
54-
body BLOB,
55-
bigendian_v1_uuid text,
54+
tid TEXT,
55+
body TEXT,
5656
page_id INTEGER,
5757
namespace INTEGER,
5858
timestamp TEXT,

bin/dump_restbase

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env node
"use strict";

/**
 * CLI entry point: dump every wiki served by a RESTBase instance.
 *
 * Example:
 *   bin/dump_restbase --workDir /tmp --dumpDir /tmp
 */

var dumpAllWikis = require('../lib/dump_restbase');

var argParser = require('yargs')
    // Fix: "directoy" -> "directory" in the usage text.
    .usage('Create HTML dumps in a directory\n'
        + '\nExample usage:\n$0 --workDir /tmp --dumpDir /tmp')
    .options('h', {
        alias: 'help',
        describe: 'Show help and exit.',
    })
    .options('v', {
        alias: 'verbose',
        describe: 'Verbose logging',
    })
    .options('workDir', {
        default: '/tmp',
        describe: 'Directory to use for in-progress dump files',
    })
    .options('dumpDir', {
        default: '/tmp',
        describe: 'Directory to use for finished dump files',
    });

var options = argParser.argv;

if (options.h) {
    argParser.showHelp();
    process.exit(1);
}

// Fix: handle the returned promise. Previously the script did
// `return dumpAllWikis(options);`, leaving any rejection unobserved and
// exiting with status 0 even when the dump failed.
dumpAllWikis(options)
.catch(function(err) {
    console.error('Dump failed:', err);
    process.exitCode = 1;
});

bin/dump_wiki

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/usr/bin/env node
"use strict";

/**
 * CLI entry point: dump a single wiki's Parsoid HTML via a RESTBase API
 * into a directory and/or a sqlite database.
 */

var P = require('bluebird'); // FIXME: unused in this script; candidate for removal

var makeDump = require('../lib/htmldump');

var argParser = require('yargs')
    .usage('Create a HTML dump in a subdir\n\n'
        + '\nExample:\n$0 --domain en.wikipedia.org --ns 0 --apiURL http://en.wikipedia.org/w/api.php')
    .demand(['apiURL', 'domain', 'ns', 'host'])
    .options('h', {
        alias: 'help'
    })
    .alias('domain', 'prefix')
    .options('d', {
        alias : 'saveDir',
        default : ''
    })
    .options('db', {
        alias : 'dataBase',
        default : ''
    })
    .options('verbose', {
        default : true
    })
    .default('host', 'http://rest.wikimedia.org');

var argv = argParser.argv;
if (argv.h) {
    argParser.showHelp();
    process.exit(1);
}

// Strip a trailing slash
argv.host = argv.host.replace(/\/$/, '');

// yargs delivers --ns as a string; the dump code expects a number.
argv.ns = Number(argv.ns);

makeDump(argv)
.then(function(res) {
    console.log('Dump done.');
})
.catch(function(err) {
    console.error('Error in main;', err);
    // Fix: report the failure via the exit status. Previously the
    // process exited 0 even when the dump failed.
    process.exitCode = 1;
});

htmldumper-generator.js

Lines changed: 0 additions & 171 deletions
This file was deleted.
File renamed without changes.

lib/dump_restbase.js

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/bin/env node
"use strict";

var P = require('bluebird');

var fs = P.promisifyAll(require('fs'));
var proc = P.promisifyAll(require('child_process'));
var preq = require('preq');
var makeDump = require('./htmldump');

// NOTE(review): these module-level defaults appear to be dead code left
// over from the refactor — dumpWiki() reads options.workDir /
// options.dumpDir instead. Kept for now; confirm and remove.
var dumpDir = '/tmp';
var workDir = '/tmp';
15+
16+
/**
 * Build the sqlite dump file name for a wiki domain.
 *
 * @param {string} domain e.g. 'en.wikipedia.org'
 * @returns {string} '<domain>.articles.ns0.sqlite3'
 */
function dumpDBName (domain) {
    // Main-namespace article dump naming convention.
    var suffix = '.articles.ns0.sqlite3';
    return domain + suffix;
}
19+
20+
/**
 * Dump a single wiki into a sqlite db, then xz-compress it with pixz.
 *
 * If a previous compressed dump exists it is uncompressed and used as
 * the starting point, so unchanged pages need not be re-fetched.
 *
 * @param {object} options - {domain, workDir, dumpDir, verbose}
 * @returns {Promise} resolves once the work db has been cleaned up
 */
function dumpWiki(options) {
    var domain = options.domain;
    var dumpName = dumpDBName(domain);
    var workDB = options.workDir + '/' + dumpName;
    var dumpDB = options.dumpDir + '/' + dumpName + '.xz';

    // If a dump exists, uncompress it & use it as a starting point
    var dumpPromise = P.resolve();
    if (fs.existsSync(dumpDB)) {
        dumpPromise = proc.execFileAsync('pixz', ['-d', dumpDB, workDB]);
    }
    return dumpPromise
    .then(function() {
        var dumpOptions = {
            dataBase: workDB,
            apiURL: 'http://' + domain + '/w/api.php',
            prefix: domain,
            ns: 0,
            host: 'http://rest.wikimedia.org',
            verbose: options.verbose
        };
        return makeDump(dumpOptions);
    })
    .then(function() {
        console.log('xz compressing');
        // Fix: return the promise. Previously the compression ran
        // detached, so its errors were unobserved and the work db could
        // be unlinked below before pixz finished writing the dump.
        return proc.execFileAsync('pixz', ['-2', workDB, dumpDB]);
    })
    // Best-effort per-wiki dump: log the error and fall through so the
    // cleanup below still runs and the overall run can continue.
    .catch(console.log)
    .then(function() {
        // Remove the uncompressed working copy.
        return fs.unlinkAsync(workDB);
    })
    .catch(function(e) {
        console.error(e);
    });
}
54+
55+
56+
/**
 * Dump every wiki listed at the RESTBase root, one after the other.
 *
 * @param {object} options - {workDir, dumpDir, verbose}
 * @returns {Promise} resolves when all dumps have been attempted
 */
function dumpAllWikis (options) {
    return preq.get({
        uri: 'http://rest.wikimedia.org/',
        headers: {
            accept: 'application/json'
        }
    })
    .then(function(res) {
        // P.each runs the dumps sequentially, limiting API load.
        return P.each(res.body.items, function(domain) {
            // Fix: don't mutate the caller's options object; give each
            // dump its own copy with the domain filled in.
            var wikiOptions = {
                domain: domain,
                workDir: options.workDir,
                dumpDir: options.dumpDir,
                verbose: options.verbose
            };
            return dumpWiki(wikiOptions);
        });
    })
    .then(function() {
        console.log('All dumps done.');
    });
}

module.exports = dumpAllWikis;
File renamed without changes.

0 commit comments

Comments
 (0)