Skip to content

Commit dd86f23

Browse files
committed
See if we can be faster with streams
1 parent fce9d9f commit dd86f23

File tree

2 files changed

+152
-54
lines changed

2 files changed

+152
-54
lines changed

PromiseStream.js

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"use strict";
2+
3+
/**
 * A pull-based stream of promise results.
 *
 * Each call to `next()` returns a promise for one result of `fn`. Up to
 * `maxConcurrency` calls to `fn` are kept in flight; results that arrive
 * before anybody asked for them are buffered.
 *
 * @param {Function} fn - called with one argument; should return a promise
 *   (plain return values and synchronous throws are tolerated as well).
 * @param {*} args - argument source. While it is a non-empty array, one
 *   element is shifted off per call. Otherwise the value itself is passed
 *   to the next call and the source is cleared; whenever the source is
 *   falsy, the result of a completed call becomes the new source, which
 *   lets each call continue from the previous result.
 * @param {number} size - completed calls only trigger further calls while
 *   fewer than `size` results are buffered.
 * @param {number} [maxConcurrency=1] - maximum number of calls in flight.
 */
function PromiseStream (fn, args, size, maxConcurrency) {
    this._buf = [];
    this._fn = fn;
    this._args = args;
    this._size = size;
    this._concurrency = 0;
    this._maxConcurrency = maxConcurrency || 1;
    this._waiters = [];
}

/**
 * Request the next result.
 *
 * @returns {Promise} settles with the oldest buffered outcome if there is
 *   one, otherwise with the outcome of the next call to complete. A failed
 *   call rejects the promise handed to exactly one `next()` caller.
 */
PromiseStream.prototype.next = function () {
    var self = this;

    // Start calls until all concurrency slots are taken.
    function fillSlots () {
        while (self._concurrency < self._maxConcurrency) {
            startRequest();
        }
    }

    function startRequest () {
        self._concurrency++;
        var arg;
        if (Array.isArray(self._args) && self._args.length) {
            arg = self._args.shift();
        } else {
            arg = self._args;
            self._args = undefined;
        }
        var pending;
        try {
            pending = Promise.resolve(self._fn(arg));
        } catch (err) {
            // A synchronous throw must release its slot like any other
            // failure, so route it through the rejection path.
            pending = Promise.reject(err);
        }
        return pending.then(handleResult, handleError);
    }

    function handleResult (res) {
        self._concurrency--;
        if (self._waiters.length) {
            self._waiters.shift().resolve(res);
        } else {
            self._buf.push(res);
        }
        if (!self._args) {
            self._args = res;
        }
        if (self._buf.length < self._size) {
            fillSlots();
        }
    }

    function handleError (err) {
        // Release the slot; without this a single failure would leave the
        // stream permanently stuck at its concurrency limit.
        self._concurrency--;
        if (self._waiters.length) {
            self._waiters.shift().reject(err);
        } else {
            // Nobody is waiting yet: park the failure in the buffer so the
            // next caller of next() receives it. The no-op handler marks the
            // rejection as handled until then.
            var failed = Promise.reject(err);
            failed.catch(function () {});
            self._buf.push(failed);
        }
        // Deliberately no automatic restart after a failure; the next call
        // to next() decides whether to continue.
    }

    fillSlots();

    if (self._buf.length) {
        // Promise.resolve passes a parked rejected promise through unchanged.
        return Promise.resolve(self._buf.shift());
    } else {
        return new Promise(function(resolve, reject) {
            self._waiters.push({
                resolve: resolve,
                reject: reject
            });
        });
    }
};
62+
63+
module.exports = PromiseStream;

htmldumper.js

Lines changed: 89 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,55 @@
11
"use strict";
22

3-
// Prefer bluebird promise implementation over es6-shim pulled in by prfun
4-
if (!global.Promise) {
3+
// Use bluebird when there is no global Promise, or when the global Promise
// lacks `promisify` (Promise.promisifyAll is used on `fs` below).
if (!global.Promise || !global.Promise.promisify) {
    global.Promise = require('bluebird');
}
7-
if (!global.Promise.promisify) {
8-
global.Promise.promisifyAll = require('bluebird').promisifyAll;
9-
}
106

117
var preq = require('preq');
12-
var http = require('http');
138
var fs = Promise.promisifyAll(require('fs'));
9+
var PromiseStream = require('./PromiseStream');
1410

15-
// Higher per-host parallelism
16-
var maxConcurrency = 50;
17-
http.globalAgent.maxSockets = maxConcurrency;
11+
// Article dump parallelism
12+
var maxConcurrency = 200;
1813

19-
/**
 * Fetch one batch of non-redirect pages (at most 500, per `gaplimit`) and
 * the revision id of each from the API at `apiURL`.
 *
 * @param {string} apiURL - endpoint the query string is appended to.
 * @param {number|string} namespace - value sent as `gapnamespace`.
 * @param {Object} [res] - previous batch result or initial seed; only
 *   `res.next` is read, as the `gapcontinue` position to resume from.
 *   A missing `res` or empty `next` starts from the beginning.
 * @returns {Promise<{articles: Array, next: string}>} `articles` is a list
 *   of `[title, revid]` pairs (spaces in titles replaced by underscores);
 *   `next` is the continuation token for the following batch, or
 *   `'finished'` when the response carried no continuation. Rejects with an
 *   Error('Articles done') when called with `res.next === 'finished'`, and
 *   with the underlying error if the request or parsing fails.
 */
function getArticles (apiURL, namespace, res) {
    var next = (res && res.next) || '';
    if (next === 'finished') {
        // nothing more to do.
        return Promise.reject(new Error('Articles done'));
    }

    var url = apiURL + '?action=query&generator=allpages&gapfilterredir=nonredirects'
        + '&gaplimit=500&prop=revisions&gapnamespace='
        + namespace + '&format=json&gapcontinue=' + encodeURIComponent( next );

    return preq.get(url, { retries: 10 })
    .then(function(response) {
        var body = response.body;
        var articles = [];
        // An empty result has no `query.pages`; treat it as an empty batch.
        var articleChunk = (body.query && body.query.pages) || {};
        Object.keys(articleChunk).forEach( function(key) {
            var article = articleChunk[key];
            if ( article.revisions !== undefined ) {
                var title = article.title.replace( / /g, '_' );
                articles.push([title, article.revisions[0].revid]);
            }
        });
        // The last batch has no `query-continue` block. Report that with
        // the 'finished' sentinel instead of throwing on the missing key.
        var cont = body['query-continue'];
        var gapcontinue = cont && cont.allpages && cont.allpages.gapcontinue;
        return {
            articles: articles,
            next: gapcontinue || 'finished'
        };
    })
    .catch(function(e) {
        console.error('Error in getArticles:', e);
        throw e;
    });
}
4751

52+
4853
function dumpArticle (prefix, title, oldid) {
4954
var dirName = prefix + '/' + encodeURIComponent(title),
5055
fileName = dirName + '/' + oldid;
@@ -92,33 +97,63 @@ function makeDump (apiURL, prefix, ns) {
9297
fs.mkdirSync(prefix);
9398
} catch (e) {}
9499

95-
function dumpBatch(articleResult) {
96-
var articles = articleResult.articles;
97-
var next = articleResult.next;
98-
Promise.all([
99-
// Fetch the next batch of oldids while processing the last one
100-
getArticles(apiURL, ns, next),
101-
102-
Promise.filter(articles, function(article) {
103-
var title = article[0];
104-
var oldid = article[1];
105-
return dumpArticle(prefix, title, oldid)
106-
.catch(function(e) {
107-
console.error('Error in makeDump:', title, oldid, e.stack);
108-
});
109-
}, { concurrency: maxConcurrency })
110-
])
111-
.then(function(results){
112-
//console.log(results);
113-
var articleResult = results[0];
114-
if (articleResult.next !== 'finished') {
115-
return dumpBatch(articleResult);
100+
var articleArgs = {
101+
apiURL: apiURL,
102+
namespace: ns,
103+
next: ''
104+
};
105+
106+
var articleStream = new PromiseStream(getArticles.bind(null, apiURL, ns),
107+
{next: ''}, 10);
108+
var articles = [];
109+
var waiters = [];
110+
111+
function processArticles (newArticles) {
112+
articles = newArticles.articles;
113+
while(waiters.length && articles.length) {
114+
waiters.pop().resolve(articles.shift());
115+
}
116+
if (waiters.length) {
117+
articleStream.next().then(processArticles);
118+
}
119+
}
120+
121+
function getArticle() {
122+
if (articles.length) {
123+
return Promise.resolve(articles.shift());
124+
} else {
125+
if (!waiters.length) {
126+
articleStream.next().then(processArticles);
116127
}
128+
return new Promise(function(resolve, reject) {
129+
waiters.push({resolve: resolve, reject: reject});
130+
});
131+
}
132+
}
133+
134+
function dumpOne () {
135+
return getArticle()
136+
.then(function(article) {
137+
var title = article[0];
138+
var oldid = article[1];
139+
return dumpArticle(prefix, title, oldid)
140+
.catch(function(e) {
141+
console.error('Error in makeDump:', title, oldid, e.stack);
142+
});
143+
});
144+
}
145+
146+
var dumpStream = new PromiseStream(dumpOne, undefined, 10, maxConcurrency);
147+
148+
function loop () {
149+
return dumpStream.next()
150+
.then(loop)
151+
.catch(function(e) {
152+
console.log(e);
117153
});
118154
}
119155

120-
return getArticles(apiURL, ns, '')
121-
.then(dumpBatch);
156+
return loop();
122157
}
123158

124159
if (module.parent === null) {

0 commit comments

Comments
 (0)