Skip to content

Commit 8c2f3f6

Browse files
author
Aaron Halfaker
committed
Merge branch 'HAKSOAT-topic_models'
2 parents 8ca8074 + 5d6d352 commit 8c2f3f6

27 files changed

+9817
-151
lines changed

Makefile

Lines changed: 39 additions & 151 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,27 @@
11
.DELETE_ON_ERROR:
22

3-
drafttopic_major_minor = 1.3
3+
drafttopic_major_minor = 1.4
44

55
models: \
66
articletopic_models \
77
drafttopic_models
88

99
drafttopic_models: \
10-
models/arwiki.drafttopic.gradient_boosting.model \
11-
models/cswiki.drafttopic.gradient_boosting.model \
12-
models/enwiki.drafttopic.gradient_boosting.model \
13-
models/kowiki.drafttopic.gradient_boosting.model \
14-
models/viwiki.drafttopic.gradient_boosting.model
10+
models/hywiki.drafttopic.gradient_boosting.model
1511

1612
articletopic_models: \
1713
models/arwiki.articletopic.gradient_boosting.model \
1814
models/cswiki.articletopic.gradient_boosting.model \
1915
models/enwiki.articletopic.gradient_boosting.model \
2016
models/kowiki.articletopic.gradient_boosting.model \
2117
models/viwiki.articletopic.gradient_boosting.model \
22-
models/wikidata.articletopic.gradient_boosting.model
18+
models/wikidata.articletopic.gradient_boosting.model \
19+
models/hywiki.articletopic.gradient_boosting.model
2320

21+
tuning_models: \
22+
tuning_reports/hywiki.articletopic.md \
23+
tuning_reports/hywiki.drafttopic.md
2424

25-
# Johnson, Isaac; Halfaker, Aaron (2019):
26-
# Wikipedia Articles and Associated WikiProject Templates. figshare. Dataset.
27-
# https://doi.org/10.6084/m9.figshare.10248344.v3
2825
datasets/enwiki.article_items_with_wikiproject_templates.20191201.json.bz2:
2926
wget https://ndownloader.figshare.com/files/20183063 -qO- > $@
3027

@@ -58,163 +55,54 @@ labels-config.json: \
5855
datasets/enwiki.labeled_article_items.json.bz2
5956
bzcat $< | ./utility write_labels taxo_labels > $@
6057

61-
62-
###################### Arabic Wikipedia ##########################
63-
64-
datasets/arwiki.balanced_article_sample.json: \
58+
datasets/hywiki.balanced_article_sample.json: \
6559
datasets/enwiki.labeled_article_items.json.bz2
66-
bzcat $< | ./utility balance_sample ar -n 1000 > $@
60+
bzcat $< | ./utility balance_sample hy -n 1000 > $@
6761

68-
datasets/arwiki.balanced_article_sample.w_draft_text.json: \
69-
datasets/arwiki.balanced_article_sample.json
62+
datasets/hywiki.balanced_article_sample.w_draft_text.json: \
63+
datasets/hywiki.balanced_article_sample.json
7064
./utility fetch_draft_text \
71-
--api-host=https://ar.wikipedia.org \
65+
--api-host=https://hy.wikipedia.org \
7266
--input=$< \
7367
--output=$@ \
7468
--debug
7569

76-
datasets/arwiki.balanced_article_sample.w_article_text.json: \
77-
datasets/arwiki.balanced_article_sample.json
70+
datasets/hywiki.balanced_article_sample.w_article_text.json: \
71+
datasets/hywiki.balanced_article_sample.json
7872
./utility fetch_article_text \
79-
--api-host=https://ar.wikipedia.org \
73+
--api-host=https://hy.wikipedia.org \
8074
--input=$< \
8175
--output=$@ \
8276
--debug
8377

78+
word2vec/hywiki-20200501-learned_vectors.50_cell.10k.kv:
79+
wget https://analytics.wikimedia.org/datasets/archive/public-datasets/all/ores/topic/vectors/hywiki-20200501-learned_vectors.50_cell.10k.kv -qO- > $@
8480

85-
word2vec/arwiki-20200501-learned_vectors.50_cell.10k.kv:
86-
wget https://analytics.wikimedia.org/datasets/archive/public-datasets/all/ores/topic/vectors/arwiki-20200501-learned_vectors.50_cell.10k.kv -qO- > $@
87-
88-
datasets/arwiki.balanced_article_sample.w_draft_cache.json: \
89-
datasets/arwiki.balanced_article_sample.w_draft_text.json \
90-
word2vec/arwiki-20200501-learned_vectors.50_cell.10k.kv
81+
datasets/hywiki.balanced_article_sample.w_draft_cache.json: \
82+
datasets/hywiki.balanced_article_sample.w_draft_text.json \
83+
word2vec/hywiki-20200501-learned_vectors.50_cell.10k.kv
9184
./utility extract_from_text \
92-
drafttopic.feature_lists.arwiki.drafttopic \
85+
drafttopic.feature_lists.hywiki.drafttopic \
9386
--input=$< \
9487
--output=$@ \
9588
--verbose
9689

97-
datasets/arwiki.balanced_article_sample.w_article_cache.json: \
98-
datasets/arwiki.balanced_article_sample.w_article_text.json \
99-
word2vec/arwiki-20200501-learned_vectors.50_cell.10k.kv
90+
datasets/hywiki.balanced_article_sample.w_article_cache.json: \
91+
datasets/hywiki.balanced_article_sample.w_article_text.json \
92+
word2vec/hywiki-20200501-learned_vectors.50_cell.10k.kv
10093
./utility extract_from_text \
101-
drafttopic.feature_lists.arwiki.articletopic \
94+
drafttopic.feature_lists.hywiki.articletopic \
10295
--input=$< \
10396
--output=$@ \
10497
--verbose
10598

106-
models/arwiki.drafttopic.gradient_boosting.model: \
107-
datasets/arwiki.balanced_article_sample.w_draft_cache.json \
108-
labels-config.json
109-
cat $< | \
110-
revscoring cv_train revscoring.scoring.models.GradientBoosting \
111-
drafttopic.feature_lists.arwiki.drafttopic taxo_labels \
112-
--debug \
113-
--labels-config=labels-config.json \
114-
-p 'n_estimators=150' \
115-
-p 'max_depth=5' \
116-
-p 'max_features="log2"' \
117-
-p 'learning_rate=0.1' \
118-
--version=$(drafttopic_major_minor).0 \
119-
--folds=5 \
120-
--multilabel > $@
121-
122-
revscoring model_info $@ > model_info/arwiki.drafttopic.md
123-
124-
models/arwiki.articletopic.gradient_boosting.model: \
125-
datasets/arwiki.balanced_article_sample.w_article_cache.json \
126-
labels-config.json
127-
cat $< | \
128-
revscoring cv_train revscoring.scoring.models.GradientBoosting \
129-
drafttopic.feature_lists.arwiki.articletopic taxo_labels \
130-
--debug \
131-
--labels-config=labels-config.json \
132-
-p 'n_estimators=150' \
133-
-p 'max_depth=5' \
134-
-p 'max_features="log2"' \
135-
-p 'learning_rate=0.1' \
136-
--version=$(drafttopic_major_minor).0 \
137-
--folds=5 \
138-
--multilabel > $@
139-
140-
revscoring model_info $@ > model_info/arwiki.articletopic.md
141-
142-
143-
tuning_reports/arwiki.drafttopic.md: \
144-
datasets/arwiki.balanced_article_sample.w_draft_cache.json
145-
cat $< | \
146-
revscoring tune config/gradient_boosting.params.yaml \
147-
drafttopic.feature_lists.arwiki.drafttopic \
148-
taxo_labels pr_auc.macro \
149-
--debug \
150-
--verbose \
151-
--multilabel \
152-
--labels-config=labels-config.yaml \
153-
--folds=3 > $@
154-
155-
tuning_reports/arwiki.articletopic.md: \
156-
datasets/arwiki.balanced_article_sample.w_article_cache.json
157-
cat $< | \
158-
revscoring tune config/gradient_boosting.params.yaml \
159-
drafttopic.feature_lists.arwiki.articletopic \
160-
taxo_labels pr_auc.macro \
161-
--debug \
162-
--verbose \
163-
--multilabel \
164-
--labels-config=labels-config.yaml \
165-
--folds=3 > $@
166-
167-
168-
################### Czech Wikipedia #############################
169-
170-
datasets/cswiki.balanced_article_sample.json: \
171-
datasets/enwiki.labeled_article_items.json.bz2
172-
bzcat $< | ./utility balance_sample cs -n 1000 > $@
173-
174-
datasets/cswiki.balanced_article_sample.w_draft_text.json: \
175-
datasets/cswiki.balanced_article_sample.json
176-
./utility fetch_draft_text \
177-
--api-host=https://cs.wikipedia.org \
178-
--input=$< \
179-
--output=$@ \
180-
--debug
181-
182-
datasets/cswiki.balanced_article_sample.w_article_text.json: \
183-
datasets/cswiki.balanced_article_sample.json
184-
./utility fetch_article_text \
185-
--api-host=https://cs.wikipedia.org \
186-
--input=$< \
187-
--output=$@ \
188-
--debug
189-
190-
191-
word2vec/cswiki-20200501-learned_vectors.50_cell.10k.kv:
192-
wget https://analytics.wikimedia.org/datasets/archive/public-datasets/all/ores/topic/vectors/cswiki-20200501-learned_vectors.50_cell.10k.kv -qO- > $@
193-
194-
datasets/cswiki.balanced_article_sample.w_draft_cache.json: \
195-
datasets/cswiki.balanced_article_sample.w_draft_text.json \
196-
word2vec/cswiki-20200501-learned_vectors.50_cell.10k.kv
197-
./utility extract_from_text \
198-
drafttopic.feature_lists.cswiki.drafttopic \
199-
--input=$< \
200-
--output=$@ \
201-
--verbose
202-
203-
datasets/cswiki.balanced_article_sample.w_article_cache.json: \
204-
datasets/cswiki.balanced_article_sample.w_article_text.json \
205-
word2vec/cswiki-20200501-learned_vectors.50_cell.10k.kv
206-
./utility extract_from_text \
207-
drafttopic.feature_lists.cswiki.articletopic \
208-
--input=$< \
209-
--output=$@ \
210-
--verbose
21199

212-
models/cswiki.drafttopic.gradient_boosting.model: \
213-
datasets/cswiki.balanced_article_sample.w_draft_cache.json \
100+
models/hywiki.drafttopic.gradient_boosting.model: \
101+
datasets/hywiki.balanced_article_sample.w_draft_cache.json \
214102
labels-config.json
215103
cat $< | \
216104
revscoring cv_train revscoring.scoring.models.GradientBoosting \
217-
drafttopic.feature_lists.cswiki.drafttopic taxo_labels \
105+
drafttopic.feature_lists.hywiki.drafttopic taxo_labels \
218106
--debug \
219107
--labels-config=labels-config.json \
220108
-p 'n_estimators=150' \
@@ -225,14 +113,14 @@ models/cswiki.drafttopic.gradient_boosting.model: \
225113
--folds=5 \
226114
--multilabel > $@
227115

228-
revscoring model_info $@ > model_info/cswiki.drafttopic.md
116+
revscoring model_info $@ > model_info/hywiki.drafttopic.md
229117

230-
models/cswiki.articletopic.gradient_boosting.model: \
231-
datasets/cswiki.balanced_article_sample.w_article_cache.json \
118+
models/hywiki.articletopic.gradient_boosting.model: \
119+
datasets/hywiki.balanced_article_sample.w_article_cache.json \
232120
labels-config.json
233121
cat $< | \
234122
revscoring cv_train revscoring.scoring.models.GradientBoosting \
235-
drafttopic.feature_lists.cswiki.articletopic taxo_labels \
123+
drafttopic.feature_lists.hywiki.articletopic taxo_labels \
236124
--debug \
237125
--labels-config=labels-config.json \
238126
-p 'n_estimators=150' \
@@ -243,26 +131,26 @@ models/cswiki.articletopic.gradient_boosting.model: \
243131
--folds=5 \
244132
--multilabel > $@
245133

246-
revscoring model_info $@ > model_info/cswiki.articletopic.md
134+
revscoring model_info $@ > model_info/hywiki.articletopic.md
247135

248136

249-
tuning_reports/cswiki.drafttopic.md: \
250-
datasets/cswiki.balanced_article_sample.w_draft_cache.json
137+
tuning_reports/hywiki.drafttopic.md: \
138+
datasets/hywiki.balanced_article_sample.w_draft_cache.json
251139
cat $< | \
252140
revscoring tune config/gradient_boosting.params.yaml \
253-
drafttopic.feature_lists.cswiki.drafttopic \
141+
drafttopic.feature_lists.hywiki.drafttopic \
254142
taxo_labels pr_auc.macro \
255143
--debug \
256144
--verbose \
257145
--multilabel \
258146
--labels-config=labels-config.json \
259147
--folds=3 > $@
260148

261-
tuning_reports/cswiki.articletopic.md: \
262-
datasets/cswiki.balanced_article_sample.w_article_cache.json
149+
tuning_reports/hywiki.articletopic.md: \
150+
datasets/hywiki.balanced_article_sample.w_article_cache.json
263151
cat $< | \
264152
revscoring tune config/gradient_boosting.params.yaml \
265-
drafttopic.feature_lists.cswiki.articletopic \
153+
drafttopic.feature_lists.hywiki.articletopic \
266154
taxo_labels pr_auc.macro \
267155
--debug \
268156
--verbose \

docs/reference/drafttopic.feature_lists.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@ drafttopic.feature\_lists.kowiki module
3636
:undoc-members:
3737
:show-inheritance:
3838

39+
drafttopic.feature\_lists.ukwiki module
40+
---------------------------------------
41+
42+
.. automodule:: drafttopic.feature_lists.ukwiki
43+
:members:
44+
:undoc-members:
45+
:show-inheritance:
46+
3947
drafttopic.feature\_lists.viwiki module
4048
---------------------------------------
4149

drafttopic/feature_lists/euwiki.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from revscoring.datasources.meta import vectorizers, mappers
2+
from revscoring.features import wikitext
3+
from revscoring.features.meta import aggregators
4+
5+
6+
# Keyed word vectors for euwiki. The load happens once, at import time, so
# importing this module requires the vectors file to be resolvable by
# `load_gensim_kv`.
_KV_FILENAME = "euwiki-20200501-learned_vectors.50_cell.10k.kv"
euwiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename=_KV_FILENAME,
    mmap='r',
)


def vectorize_words(words):
    """Look up `words` in the euwiki keyed vectors.

    Thin wrapper that binds ``euwiki_kvs`` as the first argument of
    ``vectorizers.word2vec.vectorize_words`` and returns its result
    unchanged.
    """
    return vectorizers.word2vec.vectorize_words(euwiki_kvs, words)


# Datasource: the revision's words, lower-cased, then run through
# `vectorize_words`.
_lower_cased_words = mappers.lower_case(wikitext.revision.datasources.words)
revision_text_vectors = vectorizers.word2vec(
    _lower_cased_words,
    vectorize_words,
    name="revision.text.eu_vectors",
)

# Feature: `aggregators.mean` over the word vectors, declared vector-valued.
w2v = aggregators.mean(
    revision_text_vectors,
    name="revision.text.eu_vectors_mean",
    vector=True,
)

# Both feature lists hold the single `w2v` feature. `articletopic` is the
# very same list object as `drafttopic`, not a copy.
drafttopic = [w2v]
articletopic = drafttopic

drafttopic/feature_lists/huwiki.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from revscoring.datasources.meta import vectorizers, mappers
2+
from revscoring.features import wikitext
3+
from revscoring.features.meta import aggregators
4+
5+
6+
# Keyed word vectors for huwiki. The load happens once, at import time, so
# importing this module requires the vectors file to be resolvable by
# `load_gensim_kv`.
_KV_FILENAME = "huwiki-20200501-learned_vectors.50_cell.10k.kv"
huwiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename=_KV_FILENAME,
    mmap='r',
)


def vectorize_words(words):
    """Look up `words` in the huwiki keyed vectors.

    Thin wrapper that binds ``huwiki_kvs`` as the first argument of
    ``vectorizers.word2vec.vectorize_words`` and returns its result
    unchanged.
    """
    return vectorizers.word2vec.vectorize_words(huwiki_kvs, words)


# Datasource: the revision's words, lower-cased, then run through
# `vectorize_words`.
_lower_cased_words = mappers.lower_case(wikitext.revision.datasources.words)
revision_text_vectors = vectorizers.word2vec(
    _lower_cased_words,
    vectorize_words,
    name="revision.text.hu_vectors",
)

# Feature: `aggregators.mean` over the word vectors, declared vector-valued.
w2v = aggregators.mean(
    revision_text_vectors,
    name="revision.text.hu_vectors_mean",
    vector=True,
)

# Both feature lists hold the single `w2v` feature. `articletopic` is the
# very same list object as `drafttopic`, not a copy.
drafttopic = [w2v]
articletopic = drafttopic

drafttopic/feature_lists/hywiki.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from revscoring.datasources.meta import vectorizers, mappers
2+
from revscoring.features import wikitext
3+
from revscoring.features.meta import aggregators
4+
5+
6+
# Keyed word vectors for hywiki. The load happens once, at import time, so
# importing this module requires the vectors file to be resolvable by
# `load_gensim_kv`.
_KV_FILENAME = "hywiki-20200501-learned_vectors.50_cell.10k.kv"
hywiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename=_KV_FILENAME,
    mmap='r',
)


def vectorize_words(words):
    """Look up `words` in the hywiki keyed vectors.

    Thin wrapper that binds ``hywiki_kvs`` as the first argument of
    ``vectorizers.word2vec.vectorize_words`` and returns its result
    unchanged.
    """
    return vectorizers.word2vec.vectorize_words(hywiki_kvs, words)


# Datasource: the revision's words, lower-cased, then run through
# `vectorize_words`.
_lower_cased_words = mappers.lower_case(wikitext.revision.datasources.words)
revision_text_vectors = vectorizers.word2vec(
    _lower_cased_words,
    vectorize_words,
    name="revision.text.hy_vectors",
)

# Feature: `aggregators.mean` over the word vectors, declared vector-valued.
w2v = aggregators.mean(
    revision_text_vectors,
    name="revision.text.hy_vectors_mean",
    vector=True,
)

# Both feature lists hold the single `w2v` feature. `articletopic` is the
# very same list object as `drafttopic`, not a copy.
drafttopic = [w2v]
articletopic = drafttopic

drafttopic/feature_lists/srwiki.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from revscoring.datasources.meta import vectorizers, mappers
2+
from revscoring.features import wikitext
3+
from revscoring.features.meta import aggregators
4+
5+
6+
# Keyed word vectors for srwiki. The load happens once, at import time, so
# importing this module requires the vectors file to be resolvable by
# `load_gensim_kv`.
_KV_FILENAME = "srwiki-20200501-learned_vectors.50_cell.10k.kv"
srwiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename=_KV_FILENAME,
    mmap='r',
)


def vectorize_words(words):
    """Look up `words` in the srwiki keyed vectors.

    Thin wrapper that binds ``srwiki_kvs`` as the first argument of
    ``vectorizers.word2vec.vectorize_words`` and returns its result
    unchanged.
    """
    return vectorizers.word2vec.vectorize_words(srwiki_kvs, words)


# Datasource: the revision's words, lower-cased, then run through
# `vectorize_words`.
_lower_cased_words = mappers.lower_case(wikitext.revision.datasources.words)
revision_text_vectors = vectorizers.word2vec(
    _lower_cased_words,
    vectorize_words,
    name="revision.text.sr_vectors",
)

# Feature: `aggregators.mean` over the word vectors, declared vector-valued.
w2v = aggregators.mean(
    revision_text_vectors,
    name="revision.text.sr_vectors_mean",
    vector=True,
)

# Both feature lists hold the single `w2v` feature. `articletopic` is the
# very same list object as `drafttopic`, not a copy.
drafttopic = [w2v]
articletopic = drafttopic

0 commit comments

Comments
 (0)