-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwikitopics.py
More file actions
49 lines (45 loc) · 1.17 KB
/
wikitopics.py
File metadata and controls
49 lines (45 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
from __future__ import print_function
import numpy as np
import logging
import gensim
# Emit timestamped INFO-level log records (gensim reports progress via logging).
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Number of LDA topics. Also sizes the per-topic usage counter further down,
# so the model and the counter cannot drift apart.
NUM_TOPICS = 100

# Load the word-id dictionary and the TF-IDF corpus stored under data/.
id2word = gensim.corpora.Dictionary.load_from_text(
    'data/wiki_en_output_wordids.txt')
mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')

# Train the LDA model on the corpus and persist it to disk.
model = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=id2word,
    num_topics=NUM_TOPICS,
    update_every=1,
    chunksize=10000,
    passes=1)
model.save('wiki_lda.pkl')

# Topic assignment for every document: each entry is iterated below as a
# sequence of (topic_index, value) pairs.
topics = [model[doc] for doc in mm]

# How many topics each document was assigned.
lens = np.array([len(t) for t in topics])
print(np.mean(lens <= 10))  # fraction of documents with at most 10 topics
print(np.mean(lens))        # mean number of topics per document

# Count, for each topic, the number of documents it appears in. Each
# (document, topic) pair is counted exactly once.
counts = np.zeros(NUM_TOPICS)
for doc_top in topics:
    for ti, _ in doc_top:
        counts[ti] += 1

# Show the top 64 words of the most frequently assigned topic ...
words = model.show_topic(counts.argmax(), 64)
print(words)
print()
print()
print()

# ... and of the least frequently assigned topic.
words = model.show_topic(counts.argmin(), 64)
print(words)
print()
print()
print()