|
1 | 1 | #!node --harmony --harmony_generators |
2 | 2 | "use strict"; |
3 | 3 | var suspend = require('suspend'), |
4 | | - resume = suspend.resume, |
5 | | - request = require('request'), |
6 | | - async = require('async'), |
7 | | - http = require('http'), |
8 | | - fs = require('fs'); |
| 4 | + resume = suspend.resume, |
| 5 | + request = require('request'), |
| 6 | + async = require('async'), |
| 7 | + http = require('http'), |
| 8 | + fs = require('fs'); |
9 | 9 |
|
// Raise the per-host socket cap above node's default so up to
// maxConcurrency article fetches can be in flight at once.
var maxConcurrency = 50;
http.globalAgent.maxSockets = maxConcurrency;
13 | 13 |
|
// Retrying URL get helper.
//
// Fetches `url` and returns the response body on HTTP 200. A 503
// (or a transport error) triggers an exponential back-off retry;
// any other status aborts immediately. Gives up after ~10 retries.
//
// @param {string} url - the URL to fetch
// @returns {string} response body
// @throws {Error} on a non-200/non-503 status, or when retries are exhausted
function* getURL (url) {
    var resp,
        wait = 0.1; // 10 retries, 0.1 * 2^10 = 102.4
    while (wait < 110) {
        if (resp && resp.statusCode === 200) {
            return resp.body;
        } else if (resp && resp.statusCode !== 503) {
            // Non-retryable status: fail fast with context.
            // (Original threw new Error(resp.statusCode) — a bare number.)
            throw new Error('HTTP ' + resp.statusCode + ' for ' + url);
        } else {
            // Retry after waiting for a bit (skip the wait on the very
            // first attempt, when resp is still undefined).
            if (resp !== undefined) {
                // NOTE(review): `wait` is passed to setTimeout in
                // milliseconds, but the comment above reads like seconds
                // (0.1 * 2^10 = 102.4) — confirm the intended back-off scale.
                yield setTimeout(resume(), wait);
            }
            try {
                resp = yield request.get(url, { timeout: 40*1000 }, resume());
            } catch (e) {
                // Transport-level failure: log and treat like a 503.
                console.error(e);
                resp = null;
            }
            wait = wait * 2;
        }
    }
    // BUG FIX: Error() ignores a second argument, so the original
    // `new Error('getURL failed:', url)` silently dropped the URL.
    throw new Error('getURL failed: ' + url);
}
39 | 39 |
|
// Fetch one chunk (up to 500 titles) of non-redirect pages in the given
// namespace using the MediaWiki `allpages` generator API.
//
// @param {string} apiURL - base MediaWiki api.php URL
// @param {number} namespace - namespace id to enumerate
// @param {string} next - gapcontinue token ('' for the first chunk)
// @returns {Object} { articles: [[title, revid], ...], next: token };
//   next is the literal string 'finished' once the last chunk is reached.
function* getArticles (apiURL, namespace, next) {
    var articles = [];

    var url = apiURL + '?action=query&generator=allpages&gapfilterredir=nonredirects'
        + '&gaplimit=500&prop=revisions&gapnamespace='
        + namespace + '&format=json&gapcontinue=' + encodeURIComponent( next );
    console.log(url);
    try {
        var res = JSON.parse(yield* getURL(url)),
            articleChunk = res.query.pages;
        Object.keys(articleChunk).forEach( function(key) {
            var article = articleChunk[key];
            if ( article.revisions !== undefined ) {
                // Normalize titles to underscore form, matching URL usage.
                var title = article.title.replace( / /g, '_' );
                articles.push([title, article.revisions[0].revid]);
            }
        });
        // BUG FIX: on the last chunk the API omits 'query-continue'; the
        // original dereferenced it unconditionally, threw, swallowed the
        // error, and never produced the 'finished' token makeDump's loop
        // waits for — so the dump never terminated.
        if (res['query-continue'] && res['query-continue'].allpages) {
            next = res['query-continue'].allpages.gapcontinue;
        } else {
            next = 'finished';
        }
    } catch(e) {
        // Leave `next` unchanged so the caller retries this same chunk.
        console.error('Error in getArticles:', e);
    }
    return { articles: articles, next: next || '' };
}
65 | 65 |
|
// Dump a single article revision to <prefix>/<encoded title>/<oldid>,
// fetching the Parsoid HTML and stripping data-parsoid attributes.
// Skips work if that exact revision file already exists; removes any
// stale revision files in the article directory before writing.
//
// @param {string} prefix - output directory / wiki prefix (e.g. 'enwiki')
// @param {string} title - article title in underscore form
// @param {number|string} oldid - revision id to dump
function* dumpArticle (prefix, title, oldid) {
    var dirName = prefix + '/' + encodeURIComponent(title),
        fileName = dirName + '/' + oldid;
    try {
        // Check if we already have this article revision
        var fileStats = yield fs.stat(fileName, resume());
        if (fileStats && fileStats.isFile()) {
            // We already have the article, nothing to do.
            // XXX: Also track / check last-modified time for template
            // re-expansions without revisions change
            console.log('Exists:', title, oldid);
            return;
        }
    } catch (e) { /* stat failed: not dumped yet, fall through */ }
    console.log('Dumping', title, oldid);
    var body = yield* getURL('http://parsoid-lb.eqiad.wikimedia.org/'
            + prefix + '/' + encodeURIComponent(title) + '?oldid=' + oldid);
    try {
        yield fs.mkdir(dirName, resume());
    } catch (e) { /* directory already exists */ }

    // strip data-parsoid
    body = body.replace(/ ?data-parsoid=(?:'[^']+'|"[^"]+"|\\".*?\\"|'.*?')/g, '');

    // Remove stale revision files before writing the new one.
    // BUG FIX: the original used files.forEach(function(file, cb) {...}) —
    // forEach's second callback argument is the array INDEX, not a
    // completion callback, and `file` is a bare basename, so the unlinks
    // targeted the wrong path and were never awaited.
    var files = yield fs.readdir(dirName, resume());
    yield async.each(files, function(file, cb) {
        fs.unlink(dirName + '/' + file, cb);
    }, resume());
    return yield fs.writeFile(fileName, body, resume());
}
95 | 95 |
|
// Run an array of generator functions in parallel; resume the caller
// once all of them have completed, or as soon as any one fails.
//
// @param {Array<GeneratorFunction>} functions - tasks to run concurrently
// @throws the first error produced by any task
function* par(functions) {
    // BUG FIX: with an empty array the completion callback would never
    // fire and this generator would be suspended forever.
    if (functions.length === 0) {
        return;
    }
    var outstanding = functions.length,
        done = false,
        cb = resume(),
        fnCB = function (err, res) {
            // BUG FIX: latch so the resume callback cannot be invoked a
            // second time by tasks finishing after an earlier failure.
            if (done) {
                return;
            }
            if (err) {
                done = true;
                return cb(err);
            }
            outstanding--;
            if (!outstanding) {
                done = true;
                cb(null);
            }
        };
    functions.forEach(function(fun) {
        suspend.async(fun)(fnCB);
    });
    yield null;
}
113 | 113 |
|
114 | 114 |
|
115 | 115 |
|
// Drive a full namespace dump: page through the allpages API, and dump
// each chunk of articles while the *next* chunk is being fetched in
// parallel (software pipelining via par()).
//
// @param {string} apiURL - base MediaWiki api.php URL
// @param {string} prefix - output directory / wiki prefix
// @param {number} ns - namespace id to dump
function* makeDump (apiURL, prefix, ns) {
    // Set up directories
    try {
        fs.mkdirSync(prefix);
    } catch (e) { /* already exists */ }

    var next = '',
        nextArticles;
    // Fetch the next article chunk; updates nextArticles / next in the
    // enclosing scope.
    function* getNextArticles() {
        var articleResult = yield* getArticles(apiURL, ns, next);
        nextArticles = articleResult.articles;
        next = articleResult.next;
    }
    // Dump one chunk of [title, revid] pairs with bounded concurrency.
    function* dumpNextArticles(articles) {
        var dumpArticleFn = suspend.async(function* (article) {
            var title = article[0],
                oldid = article[1];
            try {
                return yield* dumpArticle(prefix, title, oldid);
            } catch (e) {
                // Per-article failures are logged but don't abort the dump.
                console.error('Error in makeDump:', title, oldid, e.stack);
            }
        });
        yield async.eachLimit(articles, maxConcurrency, dumpArticleFn, resume());
    }
    yield* getNextArticles();
    do {
        // BUG FIX: snapshot the chunk before starting the parallel pair —
        // getNextArticles overwrites nextArticles concurrently with the
        // dump, so reading it lazily inside the inline generator races.
        var currentArticles = nextArticles;
        yield* par([getNextArticles, function* () { yield* dumpNextArticles(currentArticles); }]);
    } while (next !== 'finished');
}
147 | 147 |
|
// CLI entry point: only runs when this file is executed directly,
// not when it is require()d as a module.
if (module.parent === null) {
    var argv = require('yargs')
        .usage('Create a HTML dump in a subdir\nUsage: $0'
            + '\nExample: node --harmony htmldumper.js --prefix enwiki --ns 0 --apiURL http://en.wikipedia.org/w/api.php')
        .demand(['apiURL', 'prefix', 'ns'])
        //.default('apiURL', 'http://en.wikipedia.org/w/api.php')
        //.default('prefix', 'enwiki')
        //.default('ns', '0')
        .argv;

    // Wrap the top-level generator and kick it off with a final
    // completion callback.
    var runDump = suspend.async(makeDump);
    runDump(argv.apiURL, argv.prefix, Number(argv.ns), function(err, res) {
        if (err) {
            console.error('Error in main;', err);
        } else {
            console.log('Dump done.');
        }
    });
}
170 | 170 |
|
// Expose the dump driver so this file also works as a library.
module.exports = makeDump;
0 commit comments