11.DELETE_ON_ERROR :
22
3- drafttopic_major_minor = 1.3
3+ drafttopic_major_minor = 1.4
44
55models : \
66 articletopic_models \
77 drafttopic_models
88
99drafttopic_models : \
10- models/arwiki.drafttopic.gradient_boosting.model \
11- models/cswiki.drafttopic.gradient_boosting.model \
12- models/enwiki.drafttopic.gradient_boosting.model \
13- models/kowiki.drafttopic.gradient_boosting.model \
14- models/viwiki.drafttopic.gradient_boosting.model
10+ models/hywiki.drafttopic.gradient_boosting.model
1511
1612articletopic_models : \
1713 models/arwiki.articletopic.gradient_boosting.model \
1814 models/cswiki.articletopic.gradient_boosting.model \
1915 models/enwiki.articletopic.gradient_boosting.model \
2016 models/kowiki.articletopic.gradient_boosting.model \
2117 models/viwiki.articletopic.gradient_boosting.model \
22- models/wikidata.articletopic.gradient_boosting.model
18+ models/wikidata.articletopic.gradient_boosting.model \
19+ models/hywiki.articletopic.gradient_boosting.model
2320
21+ tuning_models : \
22+ tuning_reports/hywiki.articletopic.md \
23+ tuning_reports/hywiki.drafttopic.md
2424
25- # Johnson, Isaac; Halfaker, Aaron (2019):
26- # Wikipedia Articles and Associated WikiProject Templates. figshare. Dataset.
27- # https://doi.org/10.6084/m9.figshare.10248344.v3
2825datasets/enwiki.article_items_with_wikiproject_templates.20191201.json.bz2 :
2926 wget https://ndownloader.figshare.com/files/20183063 -qO- > $@
3027
@@ -58,163 +55,54 @@ labels-config.json: \
5855 datasets/enwiki.labeled_article_items.json.bz2
5956 bzcat $< | ./utility write_labels taxo_labels > $@
6057
61-
62- # ##################### Arabic Wikipedia ##########################
63-
64- datasets/arwiki.balanced_article_sample.json : \
58+ datasets/hywiki.balanced_article_sample.json : \
6559 datasets/enwiki.labeled_article_items.json.bz2
66- bzcat $< | ./utility balance_sample ar -n 1000 > $@
60+ bzcat $< | ./utility balance_sample hy -n 1000 > $@
6761
68- datasets/arwiki.balanced_article_sample.w_draft_text.json : \
69- datasets/arwiki.balanced_article_sample.json
62+ datasets/hywiki.balanced_article_sample.w_draft_text.json : \
63+ datasets/hywiki.balanced_article_sample.json
7064 ./utility fetch_draft_text \
71- --api-host=https://ar.wikipedia.org \
65+ --api-host=https://hy.wikipedia.org \
7266 --input=$< \
7367 --output=$@ \
7468 --debug
7569
76- datasets/arwiki.balanced_article_sample.w_article_text.json : \
77- datasets/arwiki.balanced_article_sample.json
70+ datasets/hywiki.balanced_article_sample.w_article_text.json : \
71+ datasets/hywiki.balanced_article_sample.json
7872 ./utility fetch_article_text \
79- --api-host=https://ar.wikipedia.org \
73+ --api-host=https://hy.wikipedia.org \
8074 --input=$< \
8175 --output=$@ \
8276 --debug
8377
78+ # word2vec/hywiki-20200501-learned_vectors.50_cell.10k.kv:
79+ # wget https://analytics.wikimedia.org/datasets/archive/public-datasets/all/ores/topic/vectors/hywiki-20200501-learned_vectors.50_cell.10k.kv -qO- > $@
8480
85- word2vec/arwiki-20200501-learned_vectors.50_cell.10k.kv :
86- wget https://analytics.wikimedia.org/datasets/archive/public-datasets/all/ores/topic/vectors/arwiki-20200501-learned_vectors.50_cell.10k.kv -qO- > $@
87-
88- datasets/arwiki.balanced_article_sample.w_draft_cache.json : \
89- datasets/arwiki.balanced_article_sample.w_draft_text.json \
90- word2vec/arwiki-20200501-learned_vectors.50_cell.10k.kv
81+ datasets/hywiki.balanced_article_sample.w_draft_cache.json : \
82+ datasets/hywiki.balanced_article_sample.w_draft_text.json \
83+ word2vec/hywiki-20200501-learned_vectors.50_cell.10k.kv
9184 ./utility extract_from_text \
92- drafttopic.feature_lists.arwiki.drafttopic \
85+ drafttopic.feature_lists.hywiki.drafttopic \
9386 --input=$< \
9487 --output=$@ \
9588 --verbose
9689
97- datasets/arwiki.balanced_article_sample.w_article_cache.json : \
98- datasets/arwiki.balanced_article_sample.w_article_text.json \
99- word2vec/arwiki-20200501-learned_vectors.50_cell.10k.kv
90+ datasets/hywiki.balanced_article_sample.w_article_cache.json : \
91+ datasets/hywiki.balanced_article_sample.w_article_text.json \
92+ word2vec/hywiki-20200501-learned_vectors.50_cell.10k.kv
10093 ./utility extract_from_text \
101- drafttopic.feature_lists.arwiki.articletopic \
94+ drafttopic.feature_lists.hywiki.articletopic \
10295 --input=$< \
10396 --output=$@ \
10497 --verbose
10598
106- models/arwiki.drafttopic.gradient_boosting.model : \
107- datasets/arwiki.balanced_article_sample.w_draft_cache.json \
108- labels-config.json
109- cat $< | \
110- revscoring cv_train revscoring.scoring.models.GradientBoosting \
111- drafttopic.feature_lists.arwiki.drafttopic taxo_labels \
112- --debug \
113- --labels-config=labels-config.json \
114- -p ' n_estimators=150' \
115- -p ' max_depth=5' \
116- -p ' max_features="log2"' \
117- -p ' learning_rate=0.1' \
118- --version=$(drafttopic_major_minor).0 \
119- --folds=5 \
120- --multilabel > $@
121-
122- revscoring model_info $@ > model_info/arwiki.drafttopic.md
123-
124- models/arwiki.articletopic.gradient_boosting.model : \
125- datasets/arwiki.balanced_article_sample.w_article_cache.json \
126- labels-config.json
127- cat $< | \
128- revscoring cv_train revscoring.scoring.models.GradientBoosting \
129- drafttopic.feature_lists.arwiki.articletopic taxo_labels \
130- --debug \
131- --labels-config=labels-config.json \
132- -p ' n_estimators=150' \
133- -p ' max_depth=5' \
134- -p ' max_features="log2"' \
135- -p ' learning_rate=0.1' \
136- --version=$(drafttopic_major_minor).0 \
137- --folds=5 \
138- --multilabel > $@
139-
140- revscoring model_info $@ > model_info/arwiki.articletopic.md
141-
142-
143- tuning_reports/arwiki.drafttopic.md : \
144- datasets/arwiki.balanced_article_sample.w_draft_cache.json
145- cat $< | \
146- revscoring tune config/gradient_boosting.params.yaml \
147- drafttopic.feature_lists.arwiki.drafttopic \
148- taxo_labels pr_auc.macro \
149- --debug \
150- --verbose \
151- --multilabel \
152- --labels-config=labels-config.yaml \
153- --folds=3 > $@
154-
155- tuning_reports/arwiki.articletopic.md : \
156- datasets/arwiki.balanced_article_sample.w_article_cache.json
157- cat $< | \
158- revscoring tune config/gradient_boosting.params.yaml \
159- drafttopic.feature_lists.arwiki.articletopic \
160- taxo_labels pr_auc.macro \
161- --debug \
162- --verbose \
163- --multilabel \
164- --labels-config=labels-config.yaml \
165- --folds=3 > $@
166-
167-
168- # ################## Czech Wikipedia #############################
169-
170- datasets/cswiki.balanced_article_sample.json : \
171- datasets/enwiki.labeled_article_items.json.bz2
172- bzcat $< | ./utility balance_sample cs -n 1000 > $@
173-
174- datasets/cswiki.balanced_article_sample.w_draft_text.json : \
175- datasets/cswiki.balanced_article_sample.json
176- ./utility fetch_draft_text \
177- --api-host=https://cs.wikipedia.org \
178- --input=$< \
179- --output=$@ \
180- --debug
181-
182- datasets/cswiki.balanced_article_sample.w_article_text.json : \
183- datasets/cswiki.balanced_article_sample.json
184- ./utility fetch_article_text \
185- --api-host=https://cs.wikipedia.org \
186- --input=$< \
187- --output=$@ \
188- --debug
189-
190-
191- word2vec/cswiki-20200501-learned_vectors.50_cell.10k.kv :
192- wget https://analytics.wikimedia.org/datasets/archive/public-datasets/all/ores/topic/vectors/cswiki-20200501-learned_vectors.50_cell.10k.kv -qO- > $@
193-
194- datasets/cswiki.balanced_article_sample.w_draft_cache.json : \
195- datasets/cswiki.balanced_article_sample.w_draft_text.json \
196- word2vec/cswiki-20200501-learned_vectors.50_cell.10k.kv
197- ./utility extract_from_text \
198- drafttopic.feature_lists.cswiki.drafttopic \
199- --input=$< \
200- --output=$@ \
201- --verbose
202-
203- datasets/cswiki.balanced_article_sample.w_article_cache.json : \
204- datasets/cswiki.balanced_article_sample.w_article_text.json \
205- word2vec/cswiki-20200501-learned_vectors.50_cell.10k.kv
206- ./utility extract_from_text \
207- drafttopic.feature_lists.cswiki.articletopic \
208- --input=$< \
209- --output=$@ \
210- --verbose
21199
212- models/cswiki.drafttopic.gradient_boosting.model : \
213- datasets/cswiki.balanced_article_sample.w_draft_cache.json \
100+ models/hywiki.drafttopic.gradient_boosting.model : \
101+ datasets/hywiki.balanced_article_sample.w_draft_cache.json \
214102 labels-config.json
215103 cat $< | \
216104 revscoring cv_train revscoring.scoring.models.GradientBoosting \
217- drafttopic.feature_lists.cswiki.drafttopic taxo_labels \
105+ drafttopic.feature_lists.hywiki.drafttopic taxo_labels \
218106 --debug \
219107 --labels-config=labels-config.json \
220108 -p ' n_estimators=150' \
@@ -225,14 +113,14 @@ models/cswiki.drafttopic.gradient_boosting.model: \
225113 --folds=5 \
226114 --multilabel > $@
227115
228- revscoring model_info $@ > model_info/cswiki.drafttopic.md
116+ revscoring model_info $@ > model_info/hywiki.drafttopic.md
229117
230- models/cswiki.articletopic.gradient_boosting.model : \
231- datasets/cswiki.balanced_article_sample.w_article_cache.json \
118+ models/hywiki.articletopic.gradient_boosting.model : \
119+ datasets/hywiki.balanced_article_sample.w_article_cache.json \
232120 labels-config.json
233121 cat $< | \
234122 revscoring cv_train revscoring.scoring.models.GradientBoosting \
235- drafttopic.feature_lists.cswiki.articletopic taxo_labels \
123+ drafttopic.feature_lists.hywiki.articletopic taxo_labels \
236124 --debug \
237125 --labels-config=labels-config.json \
238126 -p ' n_estimators=150' \
@@ -243,26 +131,26 @@ models/cswiki.articletopic.gradient_boosting.model: \
243131 --folds=5 \
244132 --multilabel > $@
245133
246- revscoring model_info $@ > model_info/cswiki.articletopic.md
134+ revscoring model_info $@ > model_info/hywiki.articletopic.md
247135
248136
249- tuning_reports/cswiki.drafttopic.md : \
250- datasets/cswiki.balanced_article_sample.w_draft_cache.json
137+ tuning_reports/hywiki.drafttopic.md : \
138+ datasets/hywiki.balanced_article_sample.w_draft_cache.json
251139 cat $< | \
252140 revscoring tune config/gradient_boosting.params.yaml \
253- drafttopic.feature_lists.cswiki.drafttopic \
141+ drafttopic.feature_lists.hywiki.drafttopic \
254142 taxo_labels pr_auc.macro \
255143 --debug \
256144 --verbose \
257145 --multilabel \
258146 --labels-config=labels-config.json \
259147 --folds=3 > $@
260148
261- tuning_reports/cswiki.articletopic.md : \
262- datasets/cswiki.balanced_article_sample.w_article_cache.json
149+ tuning_reports/hywiki.articletopic.md : \
150+ datasets/hywiki.balanced_article_sample.w_article_cache.json
263151 cat $< | \
264152 revscoring tune config/gradient_boosting.params.yaml \
265- drafttopic.feature_lists.cswiki.articletopic \
153+ drafttopic.feature_lists.hywiki.articletopic \
266154 taxo_labels pr_auc.macro \
267155 --debug \
268156 --verbose \
0 commit comments