Commit 7a87313
Add a pure spider variant
This only downloads each HTML revision from RESTBase, but does not store it locally. As a next step, it would be nice to extend this so that the fs-related handler code in htmldumper.js can be passed in as a parameter.
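As a rough sketch of that next step (the saveHandler parameter and its default are hypothetical, not part of this commit), the shared core could accept a per-revision handler:

    // Hypothetical extension: inject per-revision handling so that the
    // fs-based writer in htmldumper.js and this no-op spider share one core.
    function makeDump (apiURL, prefix, ns, host, saveHandler) {
        // Default: pure spider behaviour, fetch each revision and discard the body.
        saveHandler = saveHandler || function (title, oldid, body) {
            return Promise.resolve();
        };
        // ... dumpArticle would then call saveHandler(title, oldid, res.body)
        // instead of silently dropping the response.
    }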
1 parent dd86f23

2 files changed: +146, -2 lines
PromiseStream.js

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@ PromiseStream.prototype.next = function () {
     var self = this;
     function startRequest () {
         self._concurrency++;
-        console.log('start', self._concurrency);
+        //console.log('start', self._concurrency);
         var arg;
         if (Array.isArray(self._args) && self._args.length) {
             arg = self._args.shift();
@@ -26,7 +26,7 @@ PromiseStream.prototype.next = function () {
     }
 
     function handleResult (res) {
-        console.log('end', self._concurrency);
+        //console.log('end', self._concurrency);
         self._concurrency--;
         if (self._waiters.length) {
             self._waiters.shift().resolve(res);
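The new htmlspider.js below drives this class; from that usage (the constructor itself is outside this diff), the pattern is to wrap a request function and pull results one at a time, with the constructor arguments apparently being (requestFn, initialArgs, concurrency[, prefetch]), inferred from the call sites rather than confirmed here:

    // Usage pattern as it appears in htmlspider.js:
    var stream = new PromiseStream(getArticles.bind(null, apiURL, ns),
            {next: ''}, maxConcurrency);
    stream.next().then(function (res) {
        // each next() call resolves with one request's result
    });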

htmlspider.js

Lines changed: 144 additions & 0 deletions
"use strict";

if (!global.Promise || !global.Promise.promisify) {
    global.Promise = require('bluebird');
}

var preq = require('preq');
// fs is not used yet; kept for the planned storage handler (see commit message)
var fs = Promise.promisifyAll(require('fs'));
var PromiseStream = require('./PromiseStream');

// Article dump parallelism
var maxConcurrency = 70;

// Fetch the next batch of [title, revid] pairs from the MediaWiki API
function getArticles (apiURL, namespace, res) {
    var next = res.next || '';
    if (next === 'finished') {
        // nothing more to do.
        return Promise.reject('Articles done');
    }

    var url = apiURL + '?action=query&generator=allpages&gapfilterredir=nonredirects'
        + '&gaplimit=500&prop=revisions&gapnamespace='
        + namespace + '&format=json&gapcontinue=' + encodeURIComponent(next);
    //console.log(url);

    return preq.get(url, { retries: 10 })
    .then(function(res) {
        res = res.body;
        var articles = [];
        var articleChunk = res.query.pages;
        Object.keys(articleChunk).forEach(function(key) {
            var article = articleChunk[key];
            if (article.revisions !== undefined) {
                var title = article.title.replace(/ /g, '_');
                articles.push([title, article.revisions[0].revid]);
            }
        });
        next = res['query-continue'].allpages.gapcontinue;
        // XXX
        //next = 'finished';
        return {
            articles: articles,
            next: next
        };
    })
    .catch(function(e) {
        console.error('Error in getArticles:', e);
        throw e;
    });
}


// Fetch (and discard) one HTML revision from RESTBase
function dumpArticle (prefix, title, oldid, host) {
    console.log('Dumping', title, oldid);
    var url = 'http://' + host + '/v1/'
        + prefix + '/pages/' + encodeURIComponent(title) + '/html/' + oldid;
    return preq.get({ uri: url, retries: 20, timeout: 20000 })
    .then(function(res) {
        //console.log('done', title);
        return;
    });
}


function makeDump (apiURL, prefix, ns, host) {
    var articleArgs = {
        apiURL: apiURL,
        namespace: ns,
        next: ''
    };

    var articleStream = new PromiseStream(getArticles.bind(null, apiURL, ns),
            {next: ''}, maxConcurrency);
    var articles = [];
    var waiters = [];

    function processArticles (newArticles) {
        articles = newArticles.articles;
        while (waiters.length && articles.length) {
            waiters.pop().resolve(articles.shift());
        }
        if (waiters.length) {
            articleStream.next().then(processArticles);
        }
    }

    function getArticle () {
        if (articles.length) {
            return Promise.resolve(articles.shift());
        } else {
            if (!waiters.length) {
                articleStream.next().then(processArticles);
            }
            return new Promise(function(resolve, reject) {
                waiters.push({resolve: resolve, reject: reject});
            });
        }
    }

    function dumpOne () {
        return getArticle()
        .then(function(article) {
            var title = article[0];
            var oldid = article[1];
            return dumpArticle(prefix, title, oldid, host)
            .catch(function(e) {
                console.error('Error in makeDump:', title, oldid, e.stack);
            });
        });
    }

    var dumpStream = new PromiseStream(dumpOne, undefined, maxConcurrency, maxConcurrency);

    function loop () {
        return dumpStream.next()
        .then(loop)
        .catch(function(e) {
            console.log(e);
        });
    }

    return loop();
}

if (module.parent === null) {
    var argv = require('yargs')
        .usage('Spider all HTML revisions of a wiki from RESTBase\nUsage: $0'
            + '\nExample: node htmlspider.js --prefix enwiki --ns 0'
            + ' --apiURL http://en.wikipedia.org/w/api.php --host <RESTBase host>')
        .demand(['apiURL', 'prefix', 'ns', 'host'])
        //.default('apiURL', 'http://en.wikipedia.org/w/api.php')
        //.default('prefix', 'enwiki')
        //.default('ns', '0')
        .argv;

    return makeDump(argv.apiURL, argv.prefix, Number(argv.ns), argv.host)
    .then(function(res) {
        console.log('Dump done.');
    })
    .catch(function(err) {
        console.error('Error in main:', err);
    });
}

module.exports = makeDump;
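Spelled out with all of the required options, an invocation would look something like this (the --host value is a placeholder for whichever RESTBase instance is being spidered):

    node htmlspider.js --apiURL http://en.wikipedia.org/w/api.php \
        --prefix enwiki --ns 0 --host localhost:7231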
