|
1 | 1 | "use strict"; |
2 | 2 |
|
// Install bluebird as the global Promise whenever no Promise implementation
// is present, or the present one lacks promisify (the removed code mentions
// the es6-shim pulled in by prfun as one such implementation).
// Note the capital P: `global.promise` is undefined, so testing
// `global.promise.promisify` would throw a TypeError.
if (!global.Promise || !global.Promise.promisify) {
    global.Promise = require('bluebird');
}
10 | 6 |
|
11 | 7 | var preq = require('preq'); |
12 | | -var http = require('http'); |
13 | 8 | var fs = Promise.promisifyAll(require('fs')); |
| 9 | +var PromiseStream = require('./PromiseStream'); |
14 | 10 |
|
15 | | -// Higher per-host parallelism |
16 | | -var maxConcurrency = 50; |
17 | | -http.globalAgent.maxSockets = maxConcurrency; |
| 11 | +// Article dump parallelism |
| 12 | +var maxConcurrency = 200; |
18 | 13 |
|
/**
 * Fetch one batch of non-redirect pages from the MediaWiki allpages generator.
 *
 * @param {string} apiURL - base URL of the API endpoint.
 * @param {number|string} namespace - value sent as `gapnamespace`.
 * @param {Object} [res] - result of the previous call; only `res.next` is
 *   read. A missing object or empty `next` starts from the beginning.
 * @returns {Promise<{articles: Array, next: string}>} `articles` is a list of
 *   `[title, revid]` pairs (spaces in titles replaced by underscores); `next`
 *   is the continuation token, or 'finished' when the response carries none.
 *   Rejects with the string 'Articles done' when called with
 *   `res.next === 'finished'`, and re-throws request errors after logging.
 */
function getArticles (apiURL, namespace, res) {
    var next = (res && res.next) || '';
    if (next === 'finished') {
        // nothing more to do.
        return Promise.reject('Articles done');
    }

    var url = apiURL + '?action=query&generator=allpages&gapfilterredir=nonredirects'
        + '&gaplimit=500&prop=revisions&gapnamespace='
        + namespace + '&format=json&gapcontinue=' + encodeURIComponent( next );

    return preq.get(url, { retries: 10 })
    .then(function(response) {
        var body = response.body;
        // Tolerate a response without any pages (treated as an empty batch).
        var articleChunk = (body.query && body.query.pages) || {};
        var articles = [];
        Object.keys(articleChunk).forEach(function(key) {
            var article = articleChunk[key];
            if (article.revisions !== undefined) {
                var title = article.title.replace(/ /g, '_');
                articles.push([title, article.revisions[0].revid]);
            }
        });
        // Without a continuation block there is nothing left to fetch:
        // report the 'finished' sentinel instead of dereferencing undefined.
        var cont = body['query-continue'];
        var gapcontinue = cont && cont.allpages && cont.allpages.gapcontinue;
        return {
            articles: articles,
            next: gapcontinue || 'finished'
        };
    })
    .catch(function(e) {
        console.error('Error in getArticles:', e);
        throw e;
    });
}
47 | 51 |
|
| 52 | + |
48 | 53 | function dumpArticle (prefix, title, oldid) { |
49 | 54 | var dirName = prefix + '/' + encodeURIComponent(title), |
50 | 55 | fileName = dirName + '/' + oldid; |
@@ -92,33 +97,63 @@ function makeDump (apiURL, prefix, ns) { |
92 | 97 | fs.mkdirSync(prefix); |
93 | 98 | } catch (e) {} |
94 | 99 |
|
95 | | - function dumpBatch(articleResult) { |
96 | | - var articles = articleResult.articles; |
97 | | - var next = articleResult.next; |
98 | | - Promise.all([ |
99 | | - // Fetch the next batch of oldids while processing the last one |
100 | | - getArticles(apiURL, ns, next), |
101 | | - |
102 | | - Promise.filter(articles, function(article) { |
103 | | - var title = article[0]; |
104 | | - var oldid = article[1]; |
105 | | - return dumpArticle(prefix, title, oldid) |
106 | | - .catch(function(e) { |
107 | | - console.error('Error in makeDump:', title, oldid, e.stack); |
108 | | - }); |
109 | | - }, { concurrency: maxConcurrency }) |
110 | | - ]) |
111 | | - .then(function(results){ |
112 | | - //console.log(results); |
113 | | - var articleResult = results[0]; |
114 | | - if (articleResult.next !== 'finished') { |
115 | | - return dumpBatch(articleResult); |
| 100 | + var articleArgs = { |
| 101 | + apiURL: apiURL, |
| 102 | + namespace: ns, |
| 103 | + next: '' |
| 104 | + }; |
| 105 | + |
| 106 | + var articleStream = new PromiseStream(getArticles.bind(null, apiURL, ns), |
| 107 | + {next: ''}, 10); |
| 108 | + var articles = []; |
| 109 | + var waiters = []; |
| 110 | + |
| 111 | + function processArticles (newArticles) { |
| 112 | + articles = newArticles.articles; |
| 113 | + while(waiters.length && articles.length) { |
| 114 | + waiters.pop().resolve(articles.shift()); |
| 115 | + } |
| 116 | + if (waiters.length) { |
| 117 | + articleStream.next().then(processArticles); |
| 118 | + } |
| 119 | + } |
| 120 | + |
| 121 | + function getArticle() { |
| 122 | + if (articles.length) { |
| 123 | + return Promise.resolve(articles.shift()); |
| 124 | + } else { |
| 125 | + if (!waiters.length) { |
| 126 | + articleStream.next().then(processArticles); |
116 | 127 | } |
| 128 | + return new Promise(function(resolve, reject) { |
| 129 | + waiters.push({resolve: resolve, reject: reject}); |
| 130 | + }); |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + function dumpOne () { |
| 135 | + return getArticle() |
| 136 | + .then(function(article) { |
| 137 | + var title = article[0]; |
| 138 | + var oldid = article[1]; |
| 139 | + return dumpArticle(prefix, title, oldid) |
| 140 | + .catch(function(e) { |
| 141 | + console.error('Error in makeDump:', title, oldid, e.stack); |
| 142 | + }); |
| 143 | + }); |
| 144 | + } |
| 145 | + |
| 146 | + var dumpStream = new PromiseStream(dumpOne, undefined, 10, maxConcurrency); |
| 147 | + |
| 148 | + function loop () { |
| 149 | + return dumpStream.next() |
| 150 | + .then(loop) |
| 151 | + .catch(function(e) { |
| 152 | + console.log(e); |
117 | 153 | }); |
118 | 154 | } |
119 | 155 |
|
120 | | - return getArticles(apiURL, ns, '') |
121 | | - .then(dumpBatch); |
| 156 | + return loop(); |
122 | 157 | } |
123 | 158 |
|
124 | 159 | if (module.parent === null) { |
|
0 commit comments