Skip to content

Commit a07e48c

Browse files
committed
modifies wikidata feature list
1 parent 434ace8 commit a07e48c

File tree

2 files changed

+33
-14
lines changed

2 files changed

+33
-14
lines changed

drafttopic/feature_lists/wikidata.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,40 @@
1+
import re
2+
3+
from revscoring.datasources import Datasource
14
from revscoring.datasources.meta import vectorizers
25
from revscoring.features import wikibase
36
from revscoring.features.meta import aggregators
47

58

9+
# Pattern for a Wikidata item identifier: the letter "Q" followed by one or
# more digits (e.g. "Q42").
QID_RE = re.compile(r"Q[0-9]+")

# Keyed vectors loaded once at import time and shared by the vectorizer below.
# NOTE(review): mmap="r" presumably memory-maps the file read-only -- confirm
# against the revscoring/gensim loader.
wikidata_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="wikidata-20200501-learned_vectors.50_cell.10k.kv",
    mmap="r",
)
813

14+
def process_claims_to_words(claims):
    """
    Flatten (property, value) claim pairs into a flat list of word tokens.

    Every property identifier is emitted as a token. The claim's value is
    emitted right after its property only when it is a string that consists
    entirely of an item identifier ("Q" followed by digits).

    :Parameters:
        claims : iterable of 2-item pairs
            Each pair unpacks to ``(pid, value)``.

    :Returns:
        A `list` of tokens in the order the claims were iterated.
    """
    words = []
    for pid, value in claims:
        words.append(pid)
        # Only strings can be item identifiers; the isinstance guard keeps a
        # non-string value (None, number, ...) from raising TypeError in the
        # regex call.  fullmatch() is used because match() only anchors at
        # the start and would let free text such as "Q42 foo" through.
        if isinstance(value, str) and QID_RE.fullmatch(value) is not None:
            words.append(value)
    return words
921

1022
def vectorize_words(words):
    """
    Vectorize `words` using the module-level ``wikidata_kvs`` keyed vectors.

    Delegates to ``vectorizers.word2vec.vectorize_words`` and returns whatever
    that call returns.
    """
    vectorize = vectorizers.word2vec.vectorize_words
    return vectorize(wikidata_kvs, words)
1224

1325

14-
revision_text_vectors = vectorizers.word2vec(
15-
wikibase.revision.datasources.claims,
26+
# Datasource that converts a revision's claims into word tokens by running
# process_claims_to_words over the wikibase claims datasource.
claim_words = Datasource(
    "wikidata.revision.claim_words", process_claims_to_words,
    depends_on=[wikibase.revision.datasources.claims])

# Vectors for the claim words, produced through vectorize_words.
# NOTE(review): the registered names below still say "revision.text...";
# presumably kept unchanged for compatibility with existing models -- confirm.
revision_claim_words_vectors = vectorizers.word2vec(
    claim_words, vectorize_words,
    name="revision.text.wikidata_vectors")

# Mean aggregation over the claim-word vectors.
w2v = aggregators.mean(
    revision_claim_words_vectors, vector=True,
    name="revision.text.wikidata_vectors_mean")

drafttopic/utilities/balance_sample.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -70,15 +70,17 @@ def balance_sample(observations, lang_code, max_n, output):
7070
def group_labeled_obs(observations, lang_code):
    """
    Group observations by each of their taxonomy labels.

    For ``lang_code == 'wikidata'`` an observation's title is its ``'qid'``;
    for any other ``lang_code`` it is ``ob['sitelinks'][lang_code]``.
    Observations that cannot provide a title are skipped: a missing or
    ``None`` qid, or no sitelink for the requested ``lang_code``.

    :Parameters:
        observations : iterable of `dict`
            Each must carry ``'taxo_labels'`` and, depending on
            ``lang_code``, ``'qid'`` or ``'sitelinks'``.
        lang_code : `str`
            ``'wikidata'`` or a key to look up in ``ob['sitelinks']``.

    :Returns:
        A ``defaultdict(list)`` mapping each taxonomy label to a list of
        ``{'title': ..., 'taxo_labels': ...}`` dicts.  An observation with
        several labels appears (as the same dict object) under each of them.
    """
    label_obs = defaultdict(list)
    for ob in observations:
        if lang_code == 'wikidata':
            # .get() makes a missing 'qid' key behave like an explicit None
            # instead of raising KeyError.
            title = ob.get('qid')
            if title is None:
                continue
        else:
            # A missing or None 'sitelinks' is treated as "no sitelinks".
            sitelinks = ob.get('sitelinks') or {}
            if lang_code not in sitelinks:
                continue
            title = sitelinks[lang_code]

        labeled_ob = {'title': title, 'taxo_labels': ob['taxo_labels']}
        for taxo_label in ob['taxo_labels']:
            label_obs[taxo_label].append(labeled_ob)

    return label_obs

0 commit comments

Comments
 (0)