Skip to content

Commit 3ac4a24

Browse files
authored
Merge pull request #9 from d00rman/url-template
Allow users to specify the URL template to use for fetching articles
2 parents 464bb36 + 6d51556 commit 3ac4a24

File tree

5 files changed

+63
-22
lines changed

5 files changed

+63
-22
lines changed

README.md

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,39 @@ Example:
1616
1717
Options:
1818
--apiURL [required]
19-
--domain, --prefix [required]
2019
--ns [required]
21-
--host [required] [default: "http://rest.wikimedia.org"]
20+
--host [required]
21+
--domain, --prefix [default: ""]
2222
-d, --saveDir [default: ""]
2323
-t, --startTitle [default: ""]
24+
-a, --userAgent [default: "HTMLDumper"]
2425
--db, --dataBase [default: ""]
2526
--verbose [default: true]
27+
-c, --concurrency [default: 50]
28+
-u, --url [default: "{{host}}/{{domain}}/v1/page/html/{title}/{oldid}"]
2629
```
2730

31+
Parameters:
32+
- **`apiURL`**: The URL of the wiki's MediaWiki Action API endpoint.
33+
- **`ns`**: The namespace index to dump.
34+
- **`host`**: The host to send the dump requests to.
35+
- **`domain`**: If the host contains multiple domains, the one to reach.
36+
- **`saveDir`**: If saving the contents of the dump to a directory structure,
37+
this is the path to the root of the directory (see the following section).
38+
- **`startTitle`**: If resuming a Wiki dump, the article title to start with.
39+
- **`userAgent`**: The `User-Agent` header to use when sending requests. Default:
40+
`HTMLDumper`.
41+
- **`dataBase`**: If saving the contents to a SQLite3 database, the path to the
42+
file to save it to (see the next sections).
43+
- **`verbose`**: Be verbose.
44+
- **`concurrency`**: The number of parallel article fetches to do. Default:
45+
`50`.
46+
- **`url`**: The [URL
47+
template](https://github.com/wikimedia/swagger-router#uri-templating) to use
48+
when making requests for each article. The available parameters are: `title`,
49+
`oldid` and all of the options that can be set on the command line (`host`,
50+
`domain`, etc.). Default: `{{host}}/{{domain}}/v1/page/html/{title}/{oldid}`
51+
2852
### Filesystem output
2953

3054
With `--saveDir` as specified in the example above, a directory structure like

bin/dump_restbase

100644100755
File mode changed.

bin/dump_wiki

100644100755
Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ var makeDump = require('../lib/htmldump');
88
var argParser = require('yargs')
99
.usage('Create a HTML dump in a subdir\n\n'
1010
+ '\nExample:\n$0 --domain en.wikipedia.org --ns 0 --apiURL http://en.wikipedia.org/w/api.php')
11-
.demand(['apiURL', 'domain', 'ns', 'host'])
11+
.demand(['apiURL', 'ns', 'host'])
1212
.options('h', {
1313
alias: 'help'
1414
})
15-
.alias('domain', 'prefix')
15+
.options('domain', {
16+
alias: 'prefix',
17+
default: ''
18+
})
1619
.options('d', {
1720
alias : 'saveDir',
1821
default : ''
@@ -36,10 +39,14 @@ var argParser = require('yargs')
3639
alias: 'concurrency',
3740
default: 50
3841
})
42+
.options('u', {
43+
alias: 'url',
44+
default: '{{host}}/{{domain}}/v1/page/html/{title}/{oldid}'
45+
})
3946
//.default('apiURL', 'http://en.wikipedia.org/w/api.php')
4047
//.default('prefix', 'en.wikipedia.org')
4148
//.default('ns', '0')
42-
.default('host', 'http://rest.wikimedia.org');
49+
//.default('host', 'http://rest.wikimedia.org');
4350

4451
var argv = argParser.argv;
4552
if (argv.h) {

lib/htmldump.js

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
require('core-js/shim');
44

55
var P = require('bluebird');
6+
var Template = require('swagger-router').Template;
67

78
var makeFileStore = require('./filestore');
89
var makeSQLiteStore = require('./sqlitestore');
@@ -21,6 +22,9 @@ process.on('SIGUSR2', function() {
2122
var preq = require('preq');
2223
var PromiseStream = require('./PromiseStream');
2324

25+
// the request template used for fetching each individual article
26+
var articleReqTpl;
27+
2428
function getArticles (options, res) {
2529
if (!res || res.next === 'finished') {
2630
// nothing more to do.
@@ -92,24 +96,12 @@ function dumpArticle (options, title, oldid) {
9296
if (options.verbose) {
9397
console.log('Dumping', title, oldid);
9498
}
95-
var url = options.host + '/' + options.prefix
96-
+ '/v1/page/html/' + encodeURIComponent(title) + '/' + oldid;
97-
return preq.get({
98-
uri: url,
99-
headers: {
100-
'user-agent': options.userAgent,
101-
'accept-encoding': 'gzip'
102-
},
103-
retries: 5,
104-
timeout: 60000,
105-
// Request a Buffer by default, don't decode to a String. This
106-
// saves CPU cycles, but also a lot of memory as large strings are
107-
// stored in the old space of the JS heap while Buffers are stored
108-
// outside the JS heap.
109-
encoding: null
110-
})
99+
return preq.get(articleReqTpl.expand({
100+
request: {
101+
params: Object.assign({title: title, oldid: oldid}, options)
102+
}
103+
}))
111104
.then(function(res) {
112-
//console.log('done', title);
113105
if (options.store) {
114106
return options.store.saveArticle(res.body, title, oldid);
115107
}
@@ -220,6 +212,23 @@ function makeDump (options) {
220212
storeSetup = makeSQLiteStore(options);
221213
}
222214

215+
// set up the article request template once on start-up
216+
articleReqTpl = new Template({
217+
method: 'get',
218+
uri: options.url,
219+
headers: {
220+
'user-agent': options.userAgent,
221+
'accept-encoding': 'gzip'
222+
},
223+
retries: 5,
224+
timeout: 60000,
225+
// Request a Buffer by default, don't decode to a String. This
226+
// saves CPU cycles, but also a lot of memory as large strings are
227+
// stored in the old space of the JS heap while Buffers are stored
228+
// outside the JS heap.
229+
encoding: null
230+
});
231+
223232
return storeSetup
224233
.then(function(store) {
225234
options.store = store;

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"core-js": "^1.0.0",
2424
"preq": "~0.3.1",
2525
"sqlite3": "^3.0.5",
26+
"swagger-router": "^0.5.6",
2627
"yargs": "~1.2.1"
2728
}
2829
}

0 commit comments

Comments
 (0)