
Commit 63f39f1

Merge pull request #13 from wikipedia2vec/preprocess_text

Preprocess text

2 parents: 82ff3a9 + 3f08e8b

File tree

2 files changed: +16 -10 lines changed


tests/test_dump_db.py

Lines changed: 2 additions & 2 deletions
@@ -128,7 +128,7 @@ def test_parse(self):
             "'''Japan''' is a [[Sovereign state|sovereign]] [[island country|island nation]] in [[East Asia]]",
             None
         )
-        ret = dump_db._parse(page)
+        ret = dump_db._parse(page, None)
         eq_('page', ret[0])
         eq_(b'Japan', ret[1][0])
         paragraph = pickle.loads(zlib.decompress(ret[1][1]))[0][0]
@@ -140,7 +140,7 @@ def test_parse(self):

     def test_parse_redirect(self):
         page = WikiPage('日本', 'en', '#REDIRECT [[Japan]]', 'Japan')
-        ret = dump_db._parse(page)
+        ret = dump_db._parse(page, None)
         eq_('redirect', ret[0])
         eq_('日本'.encode('utf-8'), ret[1][0])
         eq_(b'Japan', ret[1][1])
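
Passing `None` here makes `_parse` fall back to the identity function (see the dump_db.pyx hunks below), so the existing assertions are unchanged. A test exercising a real callable might look like the following sketch, a method for the same test class reusing the file's existing imports; the `lowercase` helper and the added test are hypothetical, not part of this commit:

def lowercase(text):
    # module-level so it could also be shipped through DumpDB.build's Pool
    return text.lower()

    def test_parse_with_preprocess(self):
        page = WikiPage('Japan', 'en',
                        "'''Japan''' is a [[island country|island nation]]", None)
        ret = dump_db._parse(page, lowercase)
        eq_('page', ret[0])
        paragraph = pickle.loads(zlib.decompress(ret[1][1]))[0][0]
        # every text fragment passed through lowercase() while parsing
        eq_(paragraph, paragraph.lower())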

wikipedia2vec/dump_db.pyx

Lines changed: 14 additions & 8 deletions
@@ -8,6 +8,7 @@ import mwparserfromhell
 import pkg_resources
 import re
 import six
+from functools import partial
 from uuid import uuid1
 import zlib
 from contextlib import closing
@@ -149,8 +150,8 @@ cdef class DumpDB:
         return ret

     @staticmethod
-    def build(dump_reader, out_file, pool_size, chunk_size, init_map_size=500000000,
-              buffer_size=3000):
+    def build(dump_reader, out_file, pool_size, chunk_size, preprocess_func=None,
+              init_map_size=500000000, buffer_size=3000):
         with closing(lmdb.open(out_file, subdir=False, map_async=True, map_size=init_map_size,
                                max_dbs=3)) as env:
             map_size = [init_map_size]
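
With the new signature, a caller can supply a text-normalization function at build time. A minimal sketch of the intended call, assuming the `WikiDumpReader` class from this repository's dump_reader module; the `normalize` helper, file names, and pool/chunk values are illustrative, not from this commit:

import unicodedata

from wikipedia2vec.dump_db import DumpDB
from wikipedia2vec.dump_reader import WikiDumpReader

def normalize(text):
    # must be a module-level function so the Pool workers can unpickle it
    return unicodedata.normalize('NFKC', text).lower()

reader = WikiDumpReader('enwiki-latest-pages-articles.xml.bz2')
DumpDB.build(reader, 'enwiki.db', pool_size=4, chunk_size=100,
             preprocess_func=normalize)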
@@ -180,7 +181,8 @@ cdef class DumpDB:
             with closing(Pool(pool_size)) as pool:
                 page_buf = []
                 redirect_buf = []
-                for ret in pool.imap_unordered(_parse, dump_reader, chunksize=chunk_size):
+                f = partial(_parse, preprocess_func=preprocess_func)
+                for ret in pool.imap_unordered(f, dump_reader, chunksize=chunk_size):
                     if ret:
                         if ret[0] == 'page':
                             page_buf.append(ret[1])
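
`Pool.imap_unordered` pickles its callable to ship it to the worker processes, which is why the extra argument is bound with `functools.partial` around the module-level `_parse` rather than with a lambda: a `partial` of a module-level function pickles cleanly, while a lambda would raise a `PicklingError`. A self-contained sketch of the pattern, with illustrative names:

from contextlib import closing
from functools import partial
from multiprocessing import Pool

def work(item, scale):
    # module-level function: picklable, so Pool can send it to workers
    return item * scale

if __name__ == '__main__':
    f = partial(work, scale=10)  # picklable; `lambda x: work(x, 10)` is not
    with closing(Pool(2)) as pool:
        # results arrive in arbitrary order, hence the sort
        print(sorted(pool.imap_unordered(f, range(5))))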
@@ -202,7 +204,7 @@ cdef class DumpDB:
                 write_db(redirect_db, redirect_buf)


-def _parse(WikiPage page):
+def _parse(WikiPage page, preprocess_func):
     cdef int32_t n, start, end
     cdef bint abstract
     cdef unicode title, text, cur_text, wiki_text
@@ -224,16 +226,19 @@ def _parse(WikiPage page):
     cur_links = []
     abstract = True

+    if preprocess_func is None:
+        preprocess_func = lambda x: x
+
     for node in parsed.nodes:
         if isinstance(node, mwparserfromhell.nodes.Text):
             for (n, text) in enumerate(six.text_type(node).split('\n')):
                 if n == 0:
-                    cur_text += text
+                    cur_text += preprocess_func(text)
                 else:
                     if cur_text and not cur_text.isspace():
                         paragraphs.append([cur_text, cur_links, abstract])

-                    cur_text = text
+                    cur_text = preprocess_func(text)
                     cur_links = []

         elif isinstance(node, mwparserfromhell.nodes.Wikilink):
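
A detail worth noting about the `preprocess_func is None` fallback: the identity lambda is created inside `_parse`, i.e. in the worker process after the `partial`-wrapped arguments have been unpickled, so the lambda itself never crosses the process boundary; only a user-supplied `preprocess_func` has to be picklable.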
@@ -249,6 +254,7 @@ def _parse(WikiPage page):
             else:
                 text = node.title.strip_code()

+            text = preprocess_func(text)
             start = len(cur_text)
             cur_text += text
             end = len(cur_text)
@@ -259,7 +265,7 @@ def _parse(WikiPage page):
                 continue

             text = node.title.strip_code()
-            cur_text += text
+            cur_text += preprocess_func(text)

         elif isinstance(node, mwparserfromhell.nodes.Tag):
             if node.tag not in ('b', 'i', 'u'):
@@ -268,7 +274,7 @@ def _parse(WikiPage page):
                 continue

             text = node.contents.strip_code()
-            cur_text += text
+            cur_text += preprocess_func(text)

         elif isinstance(node, mwparserfromhell.nodes.Heading):
             abstract = False
