@@ -64,6 +64,28 @@ function getArticles (options, res) {
6464 } ) ;
6565}
6666
/**
 * Check whether a given article revision has already been dumped to disk.
 *
 * The path that is checked is
 * `<options.saveDir>/<options.prefix>/<encodeURIComponent(title)>/<oldid>`.
 *
 * @param {Object} options - must provide `saveDir` and `prefix`
 * @param {string} title - article title; URI-encoded to form the directory name
 * @param {number|string} oldid - revision id; used as the file name
 * @return {Promise} resolves to `true` if a regular file exists for this
 *   revision, `false` otherwise. Never rejects because of a failed stat.
 */
function checkArticle (options, title, oldid) {
    var dumpDir = options.saveDir + '/' + options.prefix;
    var dirName = dumpDir + '/' + encodeURIComponent(title);
    var fileName = dirName + '/' + oldid;
    // NOTE(review): fs.statAsync is presumably provided by a promisified
    // `fs` set up elsewhere in this file -- confirm.
    return fs.statAsync(fileName)
    .catch(function(e) {
        // A missing file or directory is the expected "not dumped yet" case.
        // Any other failure (permissions, I/O error, ...) is still treated
        // as "not dumped" so the check stays best-effort, but it is reported
        // instead of being silently swallowed.
        if (!e || (e.code !== 'ENOENT' && e.code !== 'ENOTDIR')) {
            console.error('Error checking', fileName, e);
        }
        return false;
    })
    .then(function(fileStats) {
        // Only a regular file counts as an existing dump of this revision.
        if (fileStats && fileStats.isFile()) {
            // We already have the article, nothing to do.
            // XXX: Also track / check the last-modified time, so that
            // template re-expansions without a revision change are noticed.
            console.log('Exists:', title, oldid);
            return true;
        }
        return false;
    });
}
88+
6789function saveArticle ( options , body , title , oldid ) {
6890 var dumpDir = options . saveDir + '/' + options . prefix ;
6991 var dirName = dumpDir + '/' + encodeURIComponent ( title ) ;
@@ -89,25 +111,37 @@ function saveArticle (options, body, title, oldid) {
89111}
90112
/**
 * Fetch the HTML of one article revision and, if a save directory is
 * configured, store it on disk. Revisions that are already present on disk
 * are skipped without issuing a request.
 *
 * @param {Object} options - reads `host`, `prefix` and (optionally) `saveDir`
 * @param {string} title - article title; URI-encoded into the request URL
 * @param {number|string} oldid - revision id to fetch
 * @return {Promise} resolves to the result of `saveArticle` when the
 *   revision was fetched and saved, and to `undefined` when it was skipped
 *   or when no `saveDir` is set.
 */
function dumpArticle (options, title, oldid) {
    // Without a save directory there is nothing on disk to compare against,
    // so the revision is always fetched.
    var checkRevision = options.saveDir
        ? checkArticle(options, title, oldid)
        : Promise.resolve(false);

    return checkRevision
    .then(function(checkResult) {
        if (checkResult) {
            // This revision is already dumped; nothing to do.
            return;
        }

        console.log('Dumping', title, oldid);
        var url = 'http://' + options.host + '/' + options.prefix
                + '/v1/page/' + encodeURIComponent(title) + '/html/' + oldid;
        return preq.get({
            uri: url,
            retries: 5,
            timeout: 60000,
            // Request a Buffer by default, don't decode to a String. This
            // saves CPU cycles, but also a lot of memory as large strings are
            // stored in the old space of the JS heap while Buffers are stored
            // outside the JS heap.
            encoding: null
        })
        .then(function(res) {
            if (options.saveDir) {
                return saveArticle(options, res.body, title, oldid);
            }
        });
    });
}
112146
113147// Processes chunks of articles one by one
0 commit comments