Skip to content

Commit 83ea999

Browse files
committed
Detabify
1 parent 0c72fcb commit 83ea999

File tree

1 file changed

+132
-132
lines changed

1 file changed

+132
-132
lines changed

htmldumper.js

Lines changed: 132 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -1,171 +1,171 @@
11
#!node --harmony --harmony_generators
22
"use strict";
33
var suspend = require('suspend'),
4-
resume = suspend.resume,
5-
request = require('request'),
6-
async = require('async'),
7-
http = require('http'),
8-
fs = require('fs');
4+
resume = suspend.resume,
5+
request = require('request'),
6+
async = require('async'),
7+
http = require('http'),
8+
fs = require('fs');
99

1010
// Higher per-host parallelism
1111
var maxConcurrency = 50;
1212
http.globalAgent.maxSockets = maxConcurrency;
1313

1414
// retrying URL get helper
1515
// Retrying URL GET helper: retries 503 responses and network errors with
// exponential back-off starting at 0.1 (doubling each attempt, so roughly
// 10 retries before the loop gives up).
// Returns the response body on HTTP 200; throws on any other non-503 status
// or after exhausting all retries.
function* getURL (url) {
    var resp,
        wait = 0.1; // 10 retries, 0.1 * 2^10 = 102.4
    while (wait < 110) {
        if (resp && resp.statusCode === 200) {
            return resp.body;
        } else if (resp && resp.statusCode !== 503) {
            // Non-retryable HTTP error: surface the status code.
            throw new Error(resp.statusCode);
        } else {
            // Retry after waiting for a bit (skip the wait on the first try).
            if (resp !== undefined) {
                yield setTimeout(resume(), wait);
            }
            try {
                resp = yield request.get(url, { timeout: 40*1000 }, resume());
            } catch (e) {
                // Network-level failure: log and fall through to retry.
                console.error(e);
                resp = null;
            }
            wait = wait * 2;
        }
    }
    // FIX: Error() takes a single message string; the original passed the
    // URL as a second argument, which was silently ignored and lost from
    // the error message.
    throw new Error('getURL failed: ' + url);
}
3939

4040
// Fetch one chunk (up to 500) of non-redirect page titles and their latest
// revision ids from the MediaWiki API, starting at the `next` continuation
// title.
// Returns { articles: [[title, revid], ...], next: continuation-title,
// 'finished' when the last chunk was reached, or '' after an error }.
function* getArticles (apiURL, namespace, next) {
    var articles = [];

    var url = apiURL + '?action=query&generator=allpages&gapfilterredir=nonredirects'
        + '&gaplimit=500&prop=revisions&gapnamespace='
        + namespace + '&format=json&gapcontinue=' + encodeURIComponent( next );
    console.log(url);
    try {
        var res = JSON.parse(yield* getURL(url)),
            articleChunk = res.query.pages;
        Object.keys(articleChunk).forEach( function(key) {
            var article = articleChunk[key];
            if ( article.revisions !== undefined ) {
                // Normalize titles to underscore form for use in URLs/paths.
                var title = article.title.replace( / /g, '_' );
                articles.push([title, article.revisions[0].revid]);
            }
        });
        if (res['query-continue'] && res['query-continue'].allpages) {
            next = res['query-continue'].allpages.gapcontinue;
        } else {
            // FIX: a missing query-continue section means this was the last
            // chunk. The original unconditionally dereferenced it, threw a
            // TypeError into the catch below, and left `next` unchanged —
            // so makeDump's `while (next !== 'finished')` loop re-fetched
            // the same chunk forever.
            next = 'finished';
        }
    } catch(e) {
        console.error('Error in getArticles:', e);
    }
    return { articles: articles, next: next || '' };
}
6565

6666
// Dump one parsed article to <prefix>/<urlencoded title>/<oldid>.
// Skips revisions already present on disk, strips data-parsoid attributes
// from the body, and removes stale files for the same title before writing.
function* dumpArticle (prefix, title, oldid) {
    var dirName = prefix + '/' + encodeURIComponent(title),
        fileName = dirName + '/' + oldid;
    try {
        // Check if we already have this article revision
        var fileStats = yield fs.stat(fileName, resume());
        if (fileStats && fileStats.isFile()) {
            // We already have the article, nothing to do.
            // XXX: Also track / check last-modified time for template
            // re-expansions without revisions change
            console.log('Exists:', title, oldid);
            return;
        }
    } catch (e) { }
    console.log('Dumping', title, oldid);
    var body = yield* getURL('http://parsoid-lb.eqiad.wikimedia.org/'
            + prefix + '/' + encodeURIComponent(title) + '?oldid=' + oldid);
    try {
        yield fs.mkdir(dirName, resume());
    } catch (e) {}

    // strip data-parsoid
    body = body.replace(/ ?data-parsoid=(?:'[^']+'|"[^"]+"|\\".*?\\"|&#39;.*?&#39;)/g, '');
    // Remove stale revision files for this title before writing the new one.
    // FIX: the original did `files.forEach(function(file, cb) {
    // fs.unlink(file, cb); })`, which (a) passed the forEach *index* as the
    // unlink callback and (b) unlinked bare basenames relative to the
    // process cwd rather than inside dirName, and never waited for
    // completion. Unlink each entry under dirName and yield on it.
    var files = yield fs.readdir(dirName, resume());
    for (var i = 0; i < files.length; i++) {
        try {
            yield fs.unlink(dirName + '/' + files[i], resume());
        } catch (e) {}
    }
    return yield fs.writeFile(fileName, body, resume());
}
9595

9696
// Run an array of generator functions concurrently; resumes the caller
// once every one of them has completed, or immediately on the first error.
// NOTE(review): an empty `functions` array would never fire the callback
// and hang the caller — presumably callers always pass at least one
// generator; confirm before relying on empty input.
function* par(functions) {
    var remaining = functions.length,
        done = resume();
    // Shared completion callback: count down successes, short-circuit on
    // the first error.
    function onSettled(err, res) {
        if (err) {
            return done(err);
        }
        remaining -= 1;
        if (remaining === 0) {
            done(null);
        }
    }
    functions.forEach(function(gen) {
        suspend.async(gen)(onSettled);
    });
    yield null;
}
113113

114114

115115

116116
// Create a full HTML dump for one namespace under ./<prefix>/: page through
// the wiki's article list in 500-title chunks, and pipeline the work by
// fetching the next listing chunk in parallel with dumping the current one.
function* makeDump (apiURL, prefix, ns) {
    // Set up directories
    try {
        fs.mkdirSync(prefix);
    } catch (e) {}

    // FIX: dropped the unused `articles` local from the original
    // declaration list.
    var next = '',
        nextArticles;
    // Fetch the next chunk of [title, revid] pairs; updates nextArticles
    // and the `next` continuation marker.
    function* getNextArticles() {
        var articleResult = yield* getArticles(apiURL, ns, next);
        nextArticles = articleResult.articles;
        next = articleResult.next;
    }
    // Dump one chunk of articles with bounded per-host concurrency.
    function* dumpNextArticles(articles) {
        var dumpArticleFn = suspend.async(function* (article) {
            var title = article[0],
                oldid = article[1];
            try {
                return yield* dumpArticle(prefix, title, oldid);
            } catch (e) {
                // Log and keep going: one failed article must not abort
                // the whole dump.
                console.error('Error in makeDump:', title, oldid, e.stack);
            }
        });
        yield async.eachLimit(articles, maxConcurrency, dumpArticleFn, resume());
    }
    yield* getNextArticles();
    do {
        // Overlap fetching the next listing chunk with dumping the
        // current one.
        yield* par([getNextArticles, function* () { yield* dumpNextArticles(nextArticles); }]);
    } while (next !== 'finished');
}
147147

148148
// CLI entry point: when run directly (not require()d), parse the command
// line and kick off the dump.
if (module.parent === null) {
    var argv = require('yargs')
        .usage('Create a HTML dump in a subdir\nUsage: $0'
            + '\nExample: node --harmony htmldumper.js --prefix enwiki --ns 0 --apiURL http://en.wikipedia.org/w/api.php')
        .demand(['apiURL', 'prefix', 'ns'])
        //.default('apiURL', 'http://en.wikipedia.org/w/api.php')
        //.default('prefix', 'enwiki')
        //.default('ns', '0')
        .argv;

    suspend.async(makeDump)(
        argv.apiURL,
        argv.prefix,
        Number(argv.ns),
        // Final completion callback: report overall success or failure.
        function(err, res) {
            if (err) {
                console.error('Error in main;', err);
            } else {
                console.log('Dump done.');
            }
        });
}
170170

// Expose the dump entry point for programmatic use.
module.exports = makeDump;

0 commit comments

Comments (0)