@@ -8,6 +8,7 @@ import mwparserfromhell
 import pkg_resources
 import re
 import six
+from functools import partial
 from uuid import uuid1
 import zlib
 from contextlib import closing
@@ -149,8 +150,8 @@ cdef class DumpDB:
         return ret
 
     @staticmethod
-    def build(dump_reader, out_file, pool_size, chunk_size, init_map_size=500000000,
-              buffer_size=3000):
+    def build(dump_reader, out_file, pool_size, chunk_size, preprocess_func=None,
+              init_map_size=500000000, buffer_size=3000):
         with closing(lmdb.open(out_file, subdir=False, map_async=True, map_size=init_map_size,
                                max_dbs=3)) as env:
             map_size = [init_map_size]
@@ -180,7 +181,8 @@ cdef class DumpDB:
             with closing(Pool(pool_size)) as pool:
                 page_buf = []
                 redirect_buf = []
-                for ret in pool.imap_unordered(_parse, dump_reader, chunksize=chunk_size):
+                f = partial(_parse, preprocess_func=preprocess_func)
+                for ret in pool.imap_unordered(f, dump_reader, chunksize=chunk_size):
                     if ret:
                         if ret[0] == 'page':
                             page_buf.append(ret[1])
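
`Pool.imap_unordered` hands the mapped function exactly one element of the iterable per call, so the new `preprocess_func` argument is bound into `_parse` ahead of time with `functools.partial`; the resulting partial object still pickles cleanly for the worker processes. A minimal sketch of the same pattern, under assumed names (`work` and `scale` are illustrative, not from this file):

```python
from functools import partial
from multiprocessing import Pool

def work(item, scale):
    # imap_unordered feeds each call a single item from the
    # iterable, so any extra parameter must be bound beforehand
    return item * scale

if __name__ == '__main__':
    f = partial(work, scale=10)  # picklable, unlike a bare lambda
    with Pool(2) as pool:
        print(sorted(pool.imap_unordered(f, range(5))))
        # -> [0, 10, 20, 30, 40]
```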
@@ -202,7 +204,7 @@ cdef class DumpDB:
                     write_db(redirect_db, redirect_buf)
 
 
-def _parse(WikiPage page):
+def _parse(WikiPage page, preprocess_func):
     cdef int32_t n, start, end
     cdef bint abstract
     cdef unicode title, text, cur_text, wiki_text
@@ -224,16 +226,19 @@ def _parse(WikiPage page):
     cur_links = []
     abstract = True
 
+    if preprocess_func is None:
+        preprocess_func = lambda x: x
+
     for node in parsed.nodes:
         if isinstance(node, mwparserfromhell.nodes.Text):
             for (n, text) in enumerate(six.text_type(node).split('\n')):
                 if n == 0:
-                    cur_text += text
+                    cur_text += preprocess_func(text)
                 else:
                     if cur_text and not cur_text.isspace():
                         paragraphs.append([cur_text, cur_links, abstract])
 
-                    cur_text = text
+                    cur_text = preprocess_func(text)
                     cur_links = []
 
         elif isinstance(node, mwparserfromhell.nodes.Wikilink):
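
Design note: resolving the `None` default to an identity function inside `_parse`, rather than in `build`, means only `None` has to cross the process boundary; the `lambda x: x` is created after unpickling in each worker, sidestepping the fact that lambdas themselves cannot be pickled.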
@@ -249,6 +254,7 @@ def _parse(WikiPage page):
             else:
                 text = node.title.strip_code()
 
+            text = preprocess_func(text)
             start = len(cur_text)
             cur_text += text
             end = len(cur_text)
@@ -259,7 +265,7 @@ def _parse(WikiPage page):
                 continue
 
             text = node.title.strip_code()
-            cur_text += text
+            cur_text += preprocess_func(text)
 
         elif isinstance(node, mwparserfromhell.nodes.Tag):
             if node.tag not in ('b', 'i', 'u'):
@@ -268,7 +274,7 @@ def _parse(WikiPage page):
                 continue
 
             text = node.contents.strip_code()
-            cur_text += text
+            cur_text += preprocess_func(text)
 
         elif isinstance(node, mwparserfromhell.nodes.Heading):
             abstract = False
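
With this change, any picklable callable can be passed through `DumpDB.build` to rewrite text before it is stored. A hedged usage sketch — the module paths, `WikiDumpReader`, the file names, and `lowercase` are placeholders for illustration, not confirmed by this diff:

```python
from dump_db import DumpDB              # assumed module layout
from dump_reader import WikiDumpReader  # assumed reader class

def lowercase(text):
    # defined at module level so multiprocessing can pickle it;
    # passing a lambda here would fail to pickle
    return text.lower()

if __name__ == '__main__':
    reader = WikiDumpReader('enwiki-pages-articles.xml.bz2')  # hypothetical dump file
    DumpDB.build(reader, 'enwiki.db', pool_size=4, chunk_size=100,
                 preprocess_func=lowercase)
```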