@@ -28,17 +28,17 @@ class Document:
2828 TEXT_LENGTH_THRESHOLD = 25
2929 RETRY_LENGTH = 250
3030
31- def __init__ (self , input , ** options ):
32- self .input = input
31+ def __init__ (self , input , notify = None , ** options ):
32+ self .input = input
3333 self .options = defaultdict (lambda : None )
3434 for k , v in options .items ():
3535 self .options [k ] = v
36+ self .notify = notify or logging .info
3637 self .html = None
3738
3839 def _html (self , force = False ):
3940 if force or self .html is None :
40- notify = self .options ['notify' ] or (lambda x : None )
41- self .html = parse (self .input , self .options ['url' ], notify = notify )
41+ self .html = parse (self .input , self .options ['url' ], notify = self .notify )
4242 return self .html
4343
4444 def content (self ):
@@ -48,32 +48,36 @@ def title(self):
4848 return get_title (self ._html ())
4949
5050 def summary (self ):
51- ruthless = True
52- while True :
53- self ._html (True )
54- [i .extract () for i in self .tags (self .html , 'script' , 'style' )]
55-
56- if ruthless : self .remove_unlikely_candidates ()
57- self .transform_misused_divs_into_paragraphs ()
58- candidates = self .score_paragraphs (self .options .get ('min_text_length' , self .TEXT_LENGTH_THRESHOLD ))
59- best_candidate = self .select_best_candidate (candidates )
60- if best_candidate :
61- article = self .get_article (candidates , best_candidate )
62- else :
63- if ruthless :
51+ try :
52+ ruthless = True
53+ while True :
54+ self ._html (True )
55+ [i .extract () for i in self .tags (self .html , 'script' , 'style' )]
56+
57+ if ruthless : self .remove_unlikely_candidates ()
58+ self .transform_misused_divs_into_paragraphs ()
59+ candidates = self .score_paragraphs (self .options .get ('min_text_length' , self .TEXT_LENGTH_THRESHOLD ))
60+ best_candidate = self .select_best_candidate (candidates )
61+ if best_candidate :
62+ article = self .get_article (candidates , best_candidate )
63+ else :
64+ if ruthless :
65+ ruthless = False
66+ # try again
67+ continue
68+ else :
69+ article = self .html .find ('body' ) or self .html
70+
71+ cleaned_article = self .sanitize (article , candidates )
72+ of_acceptable_length = len (cleaned_article or '' ) >= (self .options ['retry_length' ] or self .RETRY_LENGTH )
73+ if ruthless and not of_acceptable_length :
6474 ruthless = False
65- # try again
66- continue
75+ continue # try again
6776 else :
68- article = self .html .find ('body' ) or self .html
69-
70- cleaned_article = self .sanitize (article , candidates )
71- of_acceptable_length = len (cleaned_article or '' ) >= (self .options ['retry_length' ] or self .RETRY_LENGTH )
72- if ruthless and not of_acceptable_length :
73- ruthless = False
74- continue # try again
75- else :
76- return cleaned_article
77+ return cleaned_article
78+ except StandardError , e :
79+ logging .exception ('error getting summary:' )
80+ raise Unparseable (str (e ))
7781
7882 def get_article (self , candidates , best_candidate ):
7983 # Now that we have the top candidate, look through its siblings for content that might also be related.
@@ -322,6 +326,7 @@ def main():
322326 if not (len (args ) == 1 or options .url ):
323327 parser .print_help ()
324328 sys .exit (1 )
329+ logging .basicConfig (level = logging .DEBUG )
325330
326331 file = None
327332 if options .url :
0 commit comments