99
1010from utils import *
1111from math import log , exp
12- import re , probability , string , search
12+ import heapq , re , search
1313
class CountingProbDist:
    """A probability distribution formed by observing and counting examples.
    If p is an instance of this class and o is an observed value, then
    there are 3 main operations:
    p.add(o) increments the count for observation o by 1.
    p.sample() returns a random element from the distribution.
    p[o] returns the probability for o (as in a regular ProbDist)."""

    def __init__(self, observations=(), default=0):
        """Create a distribution, and optionally add in some observations.
        By default this is an unsmoothed distribution, but saying default=1,
        for example, gives you add-one smoothing.

        observations: iterable of values to count immediately (default was
        a mutable [], replaced by () — it is only iterated, never mutated).
        default: pseudocount credited to a never-before-seen value the
        first time it is looked up (see smooth_for)."""
        # update() (from utils) sets each keyword as an attribute on self.
        # n_obs starts as a float so __getitem__'s division is true
        # division even under Python 2 integer-division rules.
        # sampler is a lazily built, cached sampling closure; None means
        # it must be (re)built before the next sample() call.
        update(self, dictionary={}, n_obs=0.0, default=default, sampler=None)
        for o in observations:
            self.add(o)
3029
3130 def add (self , o ):
32- """Add an observation o to the distribution."""
31+ "Add an observation o to the distribution."
32+ self .smooth_for (o )
3333 self .dictionary [o ] += 1
3434 self .n_obs += 1
35- self .needs_recompute = True
35+ self .sampler = None
3636
37- def sample (self ):
38- """Return a random sample from the distribution."""
39- if self .needs_recompute : self ._recompute ()
40- if self .n_obs == 0 :
41- return None
42- i = bisect .bisect_left (self .table , (1 + random .randrange (self .n_obs ),))
43- (count , o ) = self .table [i ]
44- return o
37+ def smooth_for (self , o ):
38+ """Include o among the possible observations, whether or not
39+ it's been observed yet."""
40+ if o not in self .dictionary :
41+ self .dictionary [o ] = self .default
42+ self .n_obs += self .default
43+ self .sampler = None
4544
4645 def __getitem__ (self , item ):
47- """ Return an estimate of the probability of item."" "
48- if self .needs_recompute : self . _recompute ( )
46+ "Return an estimate of the probability of item."
47+ self .smooth_for ( item )
4948 return self .dictionary [item ] / self .n_obs
5049
51- def __len__ (self ):
52- if self .needs_recompute : self ._recompute ()
53- return self .n_obs
54-
5550 def top (self , n ):
5651 "Return (count, obs) tuples for the n most frequent observations."
57- items = [(v , k ) for (k , v ) in self .dictionary .items ()]
58- items .sort (); items .reverse ()
59- return items [0 :n ]
60-
61- def _recompute (self ):
62- """Recompute the total count n_obs and the table of entries."""
63- n_obs = 0
64- table = []
65- for (o , count ) in self .dictionary .items ():
66- n_obs += count
67- table .append ((n_obs , o ))
68- update (self , n_obs = float (n_obs ), table = table , needs_recompute = False )
52+ return heapq .nlargest (n , [(v , k ) for (k , v ) in self .dictionary .items ()])
53+
54+ def sample (self ):
55+ "Return a random sample from the distribution."
56+ if self .sampler is None :
57+ self .sampler = weighted_sampler (self .dictionary .keys (),
58+ self .dictionary .values ())
59+ return self .sampler ()
6960
7061#______________________________________________________________________________
7162
@@ -81,7 +72,7 @@ def samples(self, n):
8172class NgramTextModel (CountingProbDist ):
8273 """This is a discrete probability distribution over n-tuples of words.
8374 You can add, sample or get P[(word1, ..., wordn)]. The method P.samples(n)
84- builds up an n-word sequence; P.add_text and P.add_sequence add data."""
75+ builds up an n-word sequence; P.add and P.add_sequence add data."""
8576
8677 def __init__ (self , n , observation_sequence = []):
8778 ## In addition to the dictionary of n-tuples, cond_prob is a
@@ -91,7 +82,7 @@ def __init__(self, n, observation_sequence=[]):
9182 self .cond_prob = DefaultDict (CountingProbDist ())
9283 self .add_sequence (observation_sequence )
9384
94- ## sample, __len__, __getitem__ inherited from CountingProbDist
85+ ## sample, __getitem__ inherited from CountingProbDist
9586 ## Note they deal with tuples, not strings, as inputs
9687
9788 def add (self , ngram ):
@@ -113,13 +104,12 @@ def samples(self, nwords):
113104 n = self .n
114105 nminus1gram = ('' ,) * (n - 1 )
115106 output = []
116- while len (output ) < nwords :
107+ for i in range (nwords ):
108+ if nminus1gram not in self .cond_prob :
109+ nminus1gram = ('' ,) * (n - 1 ) # Cannot continue, so restart.
117110 wn = self .cond_prob [nminus1gram ].sample ()
118- if wn :
119- output .append (wn )
120- nminus1gram = nminus1gram [1 :] + (wn ,)
121- else : ## Cannot continue, so restart.
122- nminus1gram = ('' ,) * (n - 1 )
111+ output .append (wn )
112+ nminus1gram = nminus1gram [1 :] + (wn ,)
123113 return ' ' .join (output )
124114
125115#______________________________________________________________________________
@@ -404,24 +394,14 @@ def goal_test(self, state):
404394True
405395"""
406396
407- __doc__ += random_tests ("""
397+ __doc__ += ("""
408398## Compare 1-, 2-, and 3-gram word models of the same text.
409399>>> flatland = DataFile("EN-text/flatland.txt").read()
410400>>> wordseq = words(flatland)
411401>>> P1 = UnigramTextModel(wordseq)
412402>>> P2 = NgramTextModel(2, wordseq)
413403>>> P3 = NgramTextModel(3, wordseq)
414404
415- ## Generate random text from the N-gram models
416- >>> P1.samples(20)
417- 'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees'
418-
419- >>> P2.samples(20)
420- 'flatland well then can anything else more into the total destruction and circles teach others confine women must be added'
421-
422- >>> P3.samples(20)
423- 'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
424-
425405## The most frequent entries in each model
426406>>> P1.top(10)
427407[(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')]
@@ -431,6 +411,18 @@ def goal_test(self, state):
431411
432412>>> P3.top(10)
433413[(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')), (16, ('the', 'sense', 'of')), (13, ('by', 'the', 'sense')), (13, ('as', 'well', 'as')), (12, ('of', 'the', 'circles')), (12, ('of', 'sight', 'recognition')), (11, ('the', 'number', 'of')), (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
414+ """ )
415+
416+ __doc__ += random_tests ("""
417+ ## Generate random text from the N-gram models
418+ >>> P1.samples(20)
419+ 'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees'
420+
421+ >>> P2.samples(20)
422+ 'flatland well then can anything else more into the total destruction and circles teach others confine women must be added'
423+
424+ >>> P3.samples(20)
425+ 'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
434426
435427## Probabilities of some common n-grams
436428>>> P1['the']
0 commit comments