Skip to content

Commit 9396193

Browse files
committed
Add simple SQLite store
This patch adds a simple SQLite storage backend. The table schema still lacks metadata columns, but should be sufficient for meaningful scale testing. Also: - move the stores out into separate modules - remove the heapdump dependency, as it no longer works on the node 0.10 distributed in Debian Jessie
1 parent 6b66b0e commit 9396193

File tree

4 files changed

+115
-54
lines changed

4 files changed

+115
-54
lines changed

filestore.js

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"use strict";
2+
var Bluebird = require('bluebird');
3+
var fs = Bluebird.promisifyAll(require('fs'));
4+
5+
/**
 * Article store that keeps each dumped revision as a file on disk.
 *
 * @constructor
 * @param {Object} options Dump options. The store keeps a reference to this
 *   object (no copy is made); its methods read `saveDir` and `prefix` from
 *   it when building file paths.
 */
function FileStore(options) {
    // Kept by reference so later changes to the options object stay visible.
    this.options = options;
}
8+
9+
/**
 * Check whether a given revision of an article has already been dumped.
 *
 * The revision is looked up at
 * `<saveDir>/<prefix>/<encodeURIComponent(title)>/<oldid>`.
 *
 * This method deliberately does not log: the caller in htmldumper.js prints
 * the 'Exists:' message for every store backend, so logging here as well
 * reported each existing revision twice.
 *
 * @param {string} title Article title; URI-component-encoded to form the
 *   directory name.
 * @param {number|string} oldid Revision id, used as the file name.
 * @returns {Promise<boolean>} Resolves to true when a regular file exists at
 *   that path. Resolves to false when it does not, or when stat fails for
 *   any reason.
 */
FileStore.prototype.checkArticle = function checkArticle (title, oldid) {
    var options = this.options;
    var dumpDir = options.saveDir + '/' + options.prefix;
    var dirName = dumpDir + '/' + encodeURIComponent(title);
    var fileName = dirName + '/' + oldid;
    return fs.statAsync(fileName)
    .then(function(fileStats) {
        // Only a regular file counts as an existing revision.
        // XXX: Also track / check last-modified time for template
        // re-expansions without revisions change
        return fileStats.isFile();
    }, function() {
        // stat failed (typically: no such file), so treat it as not dumped.
        return false;
    });
};
31+
32+
/**
 * Store one revision of an article, replacing whatever was stored for that
 * title before.
 *
 * The revision is written to
 * `<saveDir>/<prefix>/<encodeURIComponent(title)>/<oldid>`. Every entry found
 * in the article directory beforehand is unlinked first. The new file is only
 * written once those removals have settled, so a stale file that happens to
 * share the new revision's name cannot be unlinked after the fresh content
 * was written.
 *
 * @param {string|Buffer} body Content to write.
 * @param {string} title Article title; URI-component-encoded to form the
 *   directory name.
 * @param {number|string} oldid Revision id, used as the file name.
 * @returns {Promise} Resolves once the file has been written. Rejects if the
 *   article directory cannot be created or listed, or if the write fails.
 */
FileStore.prototype.saveArticle = function saveArticle (body, title, oldid) {
    var options = this.options;
    var dumpDir = options.saveDir + '/' + options.prefix;
    var dirName = dumpDir + '/' + encodeURIComponent(title);
    var fileName = dirName + '/' + oldid;

    // Remove one previously stored entry. Removal stays best effort: a
    // failure is reported but does not prevent the new revision from being
    // saved.
    function removeStale(file) {
        var stalePath = dirName + '/' + file;
        return fs.unlinkAsync(stalePath)
        .catch(function(e) {
            console.error('Could not remove', stalePath, e);
        });
    }

    return fs.readdirAsync(dirName)
    .catch(function() {
        // Listing failed, so assume the article directory does not exist
        // yet: create the dump directory and the article directory, then
        // list the latter.
        return fs.mkdirAsync(dumpDir)
        // The dump directory usually exists already. Ignore that failure;
        // a real problem resurfaces in the mkdir below.
        .catch(function() {})
        .then(function() {
            return fs.mkdirAsync(dirName);
        })
        .then(function() {
            return fs.readdirAsync(dirName);
        });
    })
    .then(function(files) {
        // Wait for all removals instead of leaving the promises floating.
        return Bluebird.all(files.map(removeStale));
    })
    .then(function() {
        return fs.writeFileAsync(fileName, body);
    });
};
56+
57+
module.exports = FileStore;

htmldumper.js

Lines changed: 20 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
var Bluebird = require('bluebird');
44

5+
var FileStore = require('./filestore');
6+
var SQLiteStore = require('./sqlitestore');
7+
58
// Enable heap dumps in /tmp on kill -USR2.
69
// See https://github.com/bnoordhuis/node-heapdump/
710
// For node 0.6/0.8: npm install heapdump@0.1.0
@@ -14,7 +17,6 @@ process.on('SIGUSR2', function() {
1417
});
1518

1619
var preq = require('preq');
17-
var fs = Bluebird.promisifyAll(require('fs'));
1820
var PromiseStream = require('./PromiseStream');
1921

2022
// Article dump parallelism
@@ -63,56 +65,10 @@ function getArticles (options, res) {
6365
});
6466
}
6567

66-
function checkArticle (options, title, oldid) {
67-
var dumpDir = options.saveDir + '/' + options.prefix;
68-
var dirName = dumpDir + '/' + encodeURIComponent(title);
69-
var fileName = dirName + '/' + oldid;
70-
return fs.statAsync(fileName)
71-
.catch(function(e) {
72-
return false;
73-
})
74-
.then(function(fileStats) {
75-
// Check if we already have this article revision
76-
if (fileStats && fileStats.isFile()) {
77-
// We already have the article, nothing to do.
78-
// XXX: Also track / check last-modified time for template
79-
// re-expansions without revisions change
80-
console.log('Exists:', title, oldid);
81-
return true;
82-
} else {
83-
return false;
84-
}
85-
});
86-
}
87-
88-
function saveArticle (options, body, title, oldid) {
89-
var dumpDir = options.saveDir + '/' + options.prefix;
90-
var dirName = dumpDir + '/' + encodeURIComponent(title);
91-
var fileName = dirName + '/' + oldid;
92-
return fs.readdirAsync(dirName)
93-
.catch(function(e) {
94-
return fs.mkdirAsync(dumpDir)
95-
.catch(function(){})
96-
.then(function() {
97-
return fs.mkdirAsync(dirName);
98-
})
99-
.then(function() {
100-
return fs.readdirAsync(dirName);
101-
});
102-
})
103-
.then(function(files) {
104-
// Asynchronously unlink other files
105-
files.forEach(function(file) {
106-
fs.unlinkAsync(dirName + '/' + file);
107-
});
108-
return fs.writeFileAsync(fileName, body);
109-
});
110-
}
111-
11268
function dumpArticle (options, title, oldid) {
11369
var checkRevision;
114-
if (options.saveDir) {
115-
checkRevision = checkArticle(options, title, oldid);
70+
if (options.store) {
71+
checkRevision = options.store.checkArticle(title, oldid);
11672
} else {
11773
checkRevision = Bluebird.resolve(false);
11874
}
@@ -138,10 +94,12 @@ function dumpArticle (options, title, oldid) {
13894
})
13995
.then(function(res) {
14096
//console.log('done', title);
141-
if (options.saveDir) {
142-
return saveArticle(options, res.body, title, oldid);
97+
if (options.store) {
98+
return options.store.saveArticle(res.body, title, oldid);
14399
}
144100
});
101+
} else {
102+
console.log('Exists:', title, oldid);
145103
}
146104
});
147105
}
@@ -232,6 +190,10 @@ if (module.parent === null) {
232190
alias : 'saveDir',
233191
default : ''
234192
})
193+
.options('db', {
194+
alias : 'dataBase',
195+
default : ''
196+
})
235197
//.default('apiURL', 'http://en.wikipedia.org/w/api.php')
236198
//.default('prefix', 'en.wikipedia.org')
237199
//.default('ns', '0')
@@ -247,6 +209,13 @@ if (module.parent === null) {
247209
argv.host = argv.host.replace(/\/$/, '');
248210

249211
argv.ns = Number(argv.ns);
212+
213+
if (argv.saveDir) {
214+
argv.store = new FileStore(argv);
215+
} else if (argv.dataBase) {
216+
argv.store = new SQLiteStore(argv);
217+
}
218+
250219
return makeDump(argv)
251220
.then(function(res) {
252221
console.log('Dump done.');

package.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,7 @@
1818
"dependencies": {
1919
"bluebird": "~2.3.11",
2020
"preq": "~0.3.1",
21+
"sqlite3": "^3.0.5",
2122
"yargs": "~1.2.1"
22-
},
23-
"devDependencies": {
24-
"heapdump": "~0.3.3"
2523
}
2624
}

sqlitestore.js

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"use strict";
2+
var P = require('bluebird');
3+
var fs = P.promisifyAll(require('fs'));
4+
var sqlite3 = P.promisifyAll(require('sqlite3'));
5+
6+
// Schema of the single `data` table: one row per (title, revision) pair,
// carrying the article body and a namespace column.
var createTableQuery = [
    'CREATE TABLE IF NOT EXISTS data(',
    'title TEXT, revision INTEGER, body BLOB, namespace INTEGER',
    ', PRIMARY KEY(title ASC, revision DESC)',
    ')'
].join('');

// Looks up one specific revision of a title.
var checkQuery = 'select revision from data where title = ? and revision = ? limit 1';

// Drops every stored revision of a title.
var purgeTitleQuery = 'delete from data where title = ?';

// Inserts one revision; placeholders are title, revision, body, namespace.
var saveQuery = 'insert into data (title, revision, body, namespace) values (?,?,?,?)';
13+
14+
/**
 * Article store backed by a SQLite database.
 *
 * Opens the database named by `options.dataBase`, creates the `data` table
 * if it does not exist yet, and prepares the statements used by the store's
 * methods.
 *
 * @constructor
 * @param {Object} options Dump options; `options.dataBase` is passed to
 *   `sqlite3.Database` as the database file name.
 */
function SQLiteStore(options) {
    var self = this;
    self.options = options;
    self.db = new sqlite3.Database(options.dataBase);
    // Statements are prepared against the `data` table, so the CREATE TABLE
    // has to run first. Scheduling both inside serialize() makes that order
    // explicit instead of relying on the default (parallel) execution mode.
    // serialize() invokes its callback synchronously, so `queries` is set
    // before the constructor returns.
    self.db.serialize(function() {
        self.db.exec(createTableQuery);
        self.queries = {
            check: self.db.prepare(checkQuery),
            purgeTitle: self.db.prepare(purgeTitleQuery),
            save: self.db.prepare(saveQuery)
        };
    });
}
24+
25+
/**
 * Check whether a given revision of an article is already in the database.
 *
 * @param {string} title Article title.
 * @param {number|string} oldid Revision id.
 * @returns {Promise<boolean>} Resolves to true when a matching row exists
 *   and false otherwise, the same contract as FileStore's checkArticle.
 *   Rejects if the query fails.
 */
SQLiteStore.prototype.checkArticle = function checkArticle (title, oldid) {
    return this.queries.check.getAsync(title, oldid)
    .then(function(row) {
        // get() yields the matching row, or nothing when there is none;
        // normalize that to a plain boolean.
        return Boolean(row);
    });
};
28+
29+
/**
 * Store one revision of an article, replacing every revision previously
 * stored for that title.
 *
 * All rows for the title are deleted first, then the new row is inserted.
 * The two statements are not wrapped in a transaction.
 *
 * @param {string|Buffer} body Article content.
 * @param {string} title Article title.
 * @param {number|string} oldid Revision id.
 * @returns {Promise} Resolves once the row has been inserted; rejects if
 *   either statement fails.
 */
SQLiteStore.prototype.saveArticle = function saveArticle (body, title, oldid) {
    var queries = this.queries;
    // The insert statement has four placeholders (title, revision, body,
    // namespace), so bind the namespace explicitly rather than leaving the
    // last parameter unbound.
    // NOTE(review): presumably options.ns is the namespace of the articles
    // being dumped -- confirm against the caller. Anything that is not a
    // real number is stored as NULL.
    var ns = this.options.ns;
    var namespace = (typeof ns === 'number' && isFinite(ns)) ? ns : null;
    return queries.purgeTitle.runAsync(title)
    .then(function() {
        return queries.save.runAsync(title, oldid, body, namespace);
    });
};
36+
37+
module.exports = SQLiteStore;

0 commit comments

Comments
 (0)