Skip to content
This repository was archived by the owner on Sep 25, 2024. It is now read-only.

Commit 428c28b

Browse files
committed
insert fingerprint data into the secondary tables on the fly
1 parent 86a616d commit 428c28b

File tree

2 files changed

+98
-27
lines changed

2 files changed

+98
-27
lines changed

RaiseWikibase/dbconnection.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ def __init__(self):
2121

2222
self.create_content_models()
2323
self.model_ids = self.get_content_models()
24+
self.create_wbt_types()
25+
self.wbt_types = self.get_wbt_types()
2426

2527
def mysql_connect(self):
2628
"""Helper function connecting to SQL database."""
@@ -112,6 +114,24 @@ def bot_schema(self):
112114
cur.close()
113115
return out
114116

117+
def create_wbt_types(self):
    """Create the three term types in the wbt_type table.

    Inserts the names 'label', 'description' and 'alias' with
    ``INSERT IGNORE`` and commits on ``self.conn``.

    Raises:
        Whatever the database driver raises from ``execute`` or
        ``commit``; the cursor is closed in that case too.
    """
    cur = self.conn.cursor()
    try:
        q = "INSERT IGNORE INTO wbt_type (wby_name ) VALUES('label'),('description'),('alias')"
        cur.execute(q)
        self.conn.commit()
    finally:
        # Close the cursor even if execute/commit fails.
        cur.close()
124+
125+
def get_wbt_types(self):
    """Return the term types stored in the wbt_type table.

    Returns:
        dict: maps each type name (str, e.g. 'label') to its id as
        returned by the database.
    """
    cur = self.conn.cursor()
    try:
        # Name the columns explicitly so the (id, name) unpacking below does
        # not depend on the physical column order of the table.
        cur.execute("SELECT wby_id, wby_name FROM wbt_type")
        out = {}
        for type_id, name in cur.fetchall():
            # Some drivers return binary columns as bytes, others as str.
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            out[name] = type_id
        self.conn.commit()
    finally:
        # Close the cursor even if the query fails.
        cur.close()
    return out
134+
115135
def create_content_models(self):
116136
"""Creates a few content models"""
117137
cur = self.conn.cursor()
@@ -195,6 +215,53 @@ def get_content_id(self):
195215
cur.close()
196216
return content_id
197217

218+
def get_wbtl_id(self, wbtl_type_id=None, wbxl_language=None, wbx_text=None, wbxl_id=None):
    """Return the wbtl_id of a term in the wbt_term_in_lang table.

    Looks up the row matching wbtl_type_id (int), wbxl_language (language
    code) and wbx_text (str). If there is none, inserts a new row
    (wbtl_type_id, wbxl_id) and returns its id.

    Returns:
        The existing or newly created wbtl_id.
    """
    cur = self.conn.cursor()
    try:
        cur.execute("""SELECT wbtl_id FROM wbt_term_in_lang, wbt_text_in_lang,
                    wbt_text WHERE wbtl_text_in_lang_id=wbxl_id AND wbtl_type_id=%s
                    AND wbxl_language=%s AND wbxl_text_id=wbx_id AND wbx_text=%s""",
                    [wbtl_type_id, wbxl_language, wbx_text])
        row = cur.fetchone()
        # Insert only when the lookup found nothing. A database error in the
        # SELECT now propagates instead of silently triggering an insert.
        if row is None:
            cur.execute("INSERT INTO wbt_term_in_lang VALUES(NULL,%s,%s)", [wbtl_type_id, wbxl_id])
            cur.execute("SELECT LAST_INSERT_ID()")
            row = cur.fetchone()
        return row[0]
    finally:
        cur.close()
234+
235+
def get_wbxl_id(self, wbxl_language=None, wbx_text=None, wbx_id=None):
    """Return the wbxl_id of a text in a language in wbt_text_in_lang.

    Looks up the row matching wbxl_language (language code) and wbx_text
    (str). If there is none, inserts a new row (wbxl_language, wbx_id)
    and returns its id.

    Returns:
        The existing or newly created wbxl_id.
    """
    cur = self.conn.cursor()
    try:
        cur.execute("""SELECT wbxl_id FROM wbt_text_in_lang, wbt_text
                    WHERE wbxl_language=%s AND wbxl_text_id=wbx_id AND wbx_text=%s""",
                    [wbxl_language, wbx_text])
        row = cur.fetchone()
        # Insert only when the lookup found nothing. A database error in the
        # SELECT now propagates instead of silently triggering an insert.
        if row is None:
            cur.execute("INSERT INTO wbt_text_in_lang VALUES(NULL,%s,%s)",
                        [wbxl_language, wbx_id])
            cur.execute("SELECT LAST_INSERT_ID()")
            row = cur.fetchone()
        return row[0]
    finally:
        cur.close()
251+
252+
def get_wbx_id(self, wbx_text=None):
    """Return the wbx_id of wbx_text (str) in the wbt_text table.

    If the text is not stored yet, inserts it and returns the id of the
    new row.

    Returns:
        The existing or newly created wbx_id.
    """
    cur = self.conn.cursor()
    try:
        cur.execute("SELECT wbx_id FROM wbt_text WHERE wbx_text=%s", [wbx_text])
        row = cur.fetchone()
        # Insert only when the lookup found nothing. A database error in the
        # SELECT now propagates instead of silently triggering an insert.
        if row is None:
            cur.execute("INSERT INTO wbt_text VALUES(NULL,%s)", [wbx_text])
            cur.execute("SELECT LAST_INSERT_ID()")
            row = cur.fetchone()
        return row[0]
    finally:
        cur.close()
264+
198265
def get_model_id(self, content_model=None):
199266
"""Returns model_id (int) for the given content_model in content_models-table"""
200267
cur = self.conn.cursor()
@@ -382,3 +449,22 @@ def insert(self, text_id=None, text=None, page_id=None, page_title=None,
382449
cur.execute(q9)
383450
cur.close()
384451
pass
452+
453+
def insert_secondary(self, fingerprint=None, new_eid=None, content_model=None):
    """Insert fingerprint data into the secondary (term) tables.

    For every label, description and alias the text is registered through
    get_wbx_id, get_wbxl_id and get_wbtl_id, and the resulting term id is
    linked to the entity in wbt_item_terms or wbt_property_terms.

    Args:
        fingerprint: dict with the keys 'label', 'description' and 'alias'.
            Each value maps a language code to one ``{'value': ...}`` dict
            (labels, descriptions) or to a list of such dicts (aliases).
            A missing or empty part (None, {}) is skipped.
        new_eid: id of the entity the terms belong to.
        content_model: 'wikibase-item' or 'wikibase-property'. For any other
            value no entity link row is written.
    """
    # The target link table does not change inside the loops, so pick it once.
    if content_model == 'wikibase-item':
        link_query = "INSERT INTO wbt_item_terms VALUES(NULL,%s,%s)"
    elif content_model == 'wikibase-property':
        link_query = "INSERT IGNORE INTO wbt_property_terms VALUES(NULL,%s,%s)"
    else:
        link_query = None

    cur = self.conn.cursor()
    try:
        for key, by_language in (fingerprint or {}).items():
            # An entity without labels/descriptions/aliases yields None here.
            if not by_language:
                continue
            # key is 'label', 'alias' or 'description'; wby_id is its type id.
            wby_id = self.wbt_types.get(key)
            for lang, values in by_language.items():
                # Labels and descriptions hold one dict per language, aliases
                # a list of dicts. Treat both as a list.
                values = [values] if isinstance(values, dict) else values
                for value in values:
                    raw = value['value']
                    if isinstance(raw, str):
                        raw = raw.encode('utf-8')
                    # No manual escaping: every query binds the text as a
                    # parameter, so escaping here would store it escaped
                    # twice. Truncate by bytes and drop a split trailing
                    # character. NOTE(review): 255 is taken as the byte
                    # capacity of wbt_text.wbx_text; confirm against the
                    # deployed Wikibase schema.
                    wbx_text = raw[:255].decode('utf-8', errors='ignore')
                    wbx_id = self.get_wbx_id(wbx_text=wbx_text)  # wbt_text
                    wbxl_id = self.get_wbxl_id(wbxl_language=lang, wbx_text=wbx_text,
                                               wbx_id=wbx_id)  # wbt_text_in_lang
                    wbtl_id = self.get_wbtl_id(wbtl_type_id=wby_id, wbxl_language=lang,
                                               wbx_text=wbx_text,
                                               wbxl_id=wbxl_id)  # wbt_term_in_lang
                    if link_query is not None:
                        cur.execute(link_query, [new_eid, wbtl_id])
    finally:
        cur.close()

RaiseWikibase/raiser.py

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@ def page(connection=None, content_model=None, namespace=None, text=None, page_ti
2929
"""
3030
# 1. Check whether the content_model is valid.
3131
if content_model in ['wikibase-item', 'wikibase-property', 'wikibase-lexeme']:
32-
# 2. For structured data find the namespace, etype (entity type), eid
32+
# 2.1 Take labels, descriptions and aliases from text for the secondary tables
33+
fingerprint = {}
34+
fingerprint['label'] = text.get('labels')
35+
fingerprint['description'] = text.get('descriptions')
36+
fingerprint['alias'] = text.get('aliases')
37+
# 2.2 For structured data find the namespace, etype (entity type), eid
3338
# and page_title. Convert text from dict to str.
3439
namespace = namespaces[content_model]
3540
etype = content_model.replace('wikibase-', '')
@@ -75,6 +80,10 @@ def page(connection=None, content_model=None, namespace=None, text=None, page_ti
7580
content_id=content_id, model_id=model_id,
7681
content_model=content_model, namespace=namespace,
7782
rev_id=rev_id, new=new, ip=ip)
83+
# 7. Insert fingerprint (labels, descriptions & aliases) into the secondary tables.
84+
if content_model in ['wikibase-item', 'wikibase-property']:
85+
connection.insert_secondary(fingerprint=fingerprint, new_eid=new_eid,
86+
content_model=content_model)
7887
# For structured data update counters in 'wb_id_counters'-table.
7988
if new and (content_model in ['wikibase-item', 'wikibase-property', 'wikibase-lexeme']):
8089
connection.update_wb_id_counters(new_eid=new_eid, content_model=content_model)
@@ -134,30 +143,11 @@ def create_bot(bot_name='bot'):
134143

135144

136145
def building_indexing():
137-
"""Builds the secondary tables and ElasticSearch index"""
146+
"""Builds the ElasticSearch index"""
138147
connection = DBConnection()
139148
container = connection.docker_wikibase
140149
connection.conn.close()
141150

142-
# Run update.php
143-
#execute_shell(
144-
# 'docker exec ' +
145-
# container +
146-
# ' bash "-c" "php maintenance/update.php --force --quick"')
147-
# Build the secondary tables for items and properties
148-
execute_shell(
149-
'docker exec ' +
150-
container +
151-
' bash "-c" "php extensions/Wikibase/repo/maintenance/rebuildItemTerms.php --sleep 0.1"')
152-
execute_shell(
153-
'docker exec ' +
154-
container +
155-
' bash "-c" "php extensions/Wikibase/repo/maintenance/rebuildPropertyTerms.php --sleep 0.01"')
156-
# Rebuild Property Info
157-
execute_shell(
158-
'docker exec ' +
159-
container +
160-
' bash "-c" "php extensions/Wikibase/repo/maintenance/rebuildPropertyInfo.php --rebuild-all --force"')
161151
# CirrusSearch indexing. For huge tables use parallelization as explained at
162152
# https://github.com/wikimedia/mediawiki-extensions-CirrusSearch/blob/master/README
163153
execute_shell(
@@ -167,9 +157,4 @@ def building_indexing():
167157
execute_shell(
168158
'docker exec ' +
169159
container +
170-
' bash "-c" "php extensions/CirrusSearch/maintenance/ForceSearchIndex.php --skipParse"')
171-
# Run runJobs.php after indexing. See https://www.mediawiki.org/wiki/Manual:RunJobs.php
172-
execute_shell(
173-
'docker exec ' +
174-
container +
175-
' bash "-c" "php maintenance/runJobs.php"')
160+
' bash "-c" "php extensions/CirrusSearch/maintenance/ForceSearchIndex.php --skipParse"')

0 commit comments

Comments
 (0)