-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_lda.py
More file actions
74 lines (63 loc) · 2.03 KB
/
build_lda.py
File metadata and controls
74 lines (63 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
import nltk.corpus
import milk
import numpy as np
import string
from gensim import corpora, models, similarities
import sklearn.datasets
import nltk.stem
from collections import defaultdict
# Snowball stemmer for English; applied to every token further down.
english_stemmer = nltk.stem.SnowballStemmer('english')

# Tokens to drop: NLTK's English stopword list plus four extra entries
# (presumably e-mail/newsgroup header artefacts -- verify against the data).
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords |= set(['from:', 'subject:', 'writes:', 'writes'])
class DirectText(corpora.textcorpus.TextCorpus):
    """Text corpus that serves its input object directly.

    Nothing is read or tokenised here: whatever was stored as
    ``self.input`` is returned unchanged. In this script the class is
    constructed with a list of token lists.
    """

    def get_texts(self):
        """Return the stored input object as the collection of texts."""
        documents = self.input
        return documents

    def __len__(self):
        """Return the number of items in the stored input."""
        documents = self.input
        return len(documents)
# Load the "train" split of the "20news-18828" dataset from ../data.
dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
                                       mlcomp_root='../data')

# ``otexts`` keeps the untouched documents so they can be printed as-is
# later; ``texts`` starts from the same data and is cleaned step by step.
otexts = dataset.data
texts = dataset.data

# A token containing any of these characters is discarded. The digits come
# from string.digits so that none is missed (the earlier hand-typed literal
# "012345689" left out '7'). Built once instead of once per token.
_rejected_chars = frozenset("+-.?!()>@" + string.digits)

# Decode, dropping byte sequences that are not valid UTF-8.
texts = [doc.decode('utf-8', 'ignore') for doc in texts]
# Whitespace tokenisation followed by lower-casing.
texts = [doc.split() for doc in texts]
texts = [[tok.lower() for tok in doc] for doc in texts]
# Drop tokens that contain punctuation or digits.
texts = [[tok for tok in doc if not _rejected_chars.intersection(tok)]
         for doc in texts]
# Drop tokens of three characters or fewer, and stopwords.
texts = [[tok for tok in doc if len(tok) > 3 and tok not in stopwords]
         for doc in texts]
# Reduce every remaining token to its stem.
texts = [[english_stemmer.stem(tok) for tok in doc] for doc in texts]
# Document frequency: for each token, the number of documents it occurs in
# (a token repeated inside one document is counted once for that document).
usage = defaultdict(int)
for doc in texts:
    for token in set(doc):
        usage[token] += 1

# Tokens that occur in more than a tenth of all documents are removed.
limit = len(texts) / 10
too_common = set(token for token, count in usage.items() if count > limit)
texts = [[token for token in doc if token not in too_common]
         for doc in texts]
# Wrap the cleaned token lists as a corpus and take the dictionary
# (vocabulary) object it exposes.
corpus = DirectText(texts)
dictionary = corpus.dictionary

# Deliberate lookup with a key that is not a token id.
# NOTE(review): presumably this makes gensim (re)build the lazily populated
# ``dictionary.id2token`` mapping that is passed to the model just below --
# confirm against the gensim version in use.
# Only the KeyError from the failed lookup is expected, so only that is
# suppressed; any other error is allowed to surface.
try:
    dictionary['computer']
except KeyError:
    pass

# Number of LDA topics; also the width of the per-document topic matrix.
num_topics = 100
model = models.ldamodel.LdaModel(
    corpus, num_topics=num_topics, id2word=dictionary.id2token)

# thetas[d, k] accumulates the value the model reports for topic k in
# document d; topics the model does not report for a document stay at 0.
thetas = np.zeros((len(texts), num_topics))
for doc_index, bow in enumerate(corpus):
    for topic_id, weight in model[bow]:
        thetas[doc_index, topic_id] += weight
# Pairwise distances between the documents' topic vectors.
distances = milk.unsupervised.pdist(thetas)

# Overwrite the diagonal with a value larger than every real distance so
# that argmin() below can never select a document as its own neighbour.
large = distances.max() + 1
np.fill_diagonal(distances, large)

# Show document 1, three blank lines, then the document closest to it.
# The parenthesised single-argument form prints the same text under the
# Python 2 print statement and the Python 3 print function.
print(otexts[1])
print("")
print("")
print("")
print(otexts[distances[1].argmin()])