-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpage.py
More file actions
165 lines (117 loc) · 4.66 KB
/
Copy pathpage.py
File metadata and controls
165 lines (117 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
### PAGE #############################################################################################
# Code for querying the HTML DOM.
# It wraps BeautifulSoup by Leonard Richardson.
# Author: Tom De Smedt.
# Copyright (c) 2007 by Tom De Smedt.
# See LICENSE.txt for details.
from .BeautifulSoup import BeautifulSoup, Tag
from .url import URLAccumulator, URLParser
from .html import replace_entities, plain
from .cache import Cache
def clear_cache():
    """ Empties the "html" cache.
    """
    html_cache = Cache("html")
    html_cache.clear()
### PAGE ERRORS ######################################################################################
class PageUnicodeError(Exception):
    """ Error recorded on a Page when parsing raises a UnicodeEncodeError.
    """

    def __str__(self):
        # The message is simply the string form of the exception class itself.
        return str(type(self))
class PageParseError(Exception):
    """ Error recorded on a Page when parsing fails for any other reason.
    """

    def __str__(self):
        # The message is simply the string form of the exception class itself.
        return str(type(self))
### PAGE #############################################################################################
# Expose BeautifulSoup's camelCase Tag.findAll under the snake_case name find_all as well,
# so every Tag (and therefore Page) can be queried with find_all().
Tag.find_all = Tag.findAll
class Page(BeautifulSoup, URLAccumulator):

    """ DOM tree of a HTML page.

    Page is essentially an asynchronous download of a BeautifulSoup page.
    It has the following properties:
    title - the text of the <title> element (None if there is none)
    description - the meta description ("" if there is none)
    keywords - a list of meta keywords ([] if there are none)
    It has the following methods:
    links() - by default, returns external links
    find_class(classname, tag) - find all tags with the given class
    find(tag, attribute=value) - find the first tag with given attributes
    find_all(tag, attribute=value) - find all tags with given attributes
    find() and find_all() return objects that have find() and find_all() too.
    They're essentially lists of Tag objects.

    Alternatively, get tags directly as properties, e.g.
    page.body.p - returns a list of all p Tag objects (each has find() and find_all() )

    To get attributes from a Tag:
    p["id"]

    If parsing fails, the page is left empty and page.error is set to
    a PageUnicodeError or PageParseError.
    """

    def __init__(self, url, wait=10, asynchronous=False, cached=True):
        """ Starts retrieving the given URL through URLAccumulator.
        The "html" cache name is passed along when cached is True, otherwise None.
        """
        cache = "html" if cached else None
        URLAccumulator.__init__(self, url, wait, asynchronous, cache)

    def load(self, data):
        """ Parses the given data (after replacing entities) into this BeautifulSoup tree.
        When parsing fails, self.error is set and the tree is initialised from an empty string.
        """
        data = replace_entities(data)
        try:
            BeautifulSoup.__init__(self, data)
        except UnicodeEncodeError:
            self.error = PageUnicodeError()
            BeautifulSoup.__init__(self, "")
        except Exception:
            # Exception rather than a bare except,
            # so KeyboardInterrupt and SystemExit still propagate.
            self.error = PageParseError()
            BeautifulSoup.__init__(self, "")

    def _title(self):
        """ Returns the page title, or None if the page has no <title> element.
        """
        title = self.find("title")
        if title is None:
            return None
        return title.string

    title = property(_title)

    def _meta_content(self, name):
        """ Returns the content attribute of the <meta> tag with the given name,
        or None if the tag or the attribute is missing.
        """
        meta = self.find("meta", {"name": name})
        if meta is None:
            return None
        # A Tag is not a dict: attributes are looked up with Tag.get(),
        # which yields None when the attribute is absent.
        return meta.get("content")

    def _description(self):
        """ Returns the meta description in the page, or "" if there is none.
        """
        content = self._meta_content("description")
        if content is None:
            return ""
        return content

    description = property(_description)

    def _keywords(self):
        """ Returns the meta keywords in the page as a list of stripped strings,
        or [] if there are none.
        """
        content = self._meta_content("keywords")
        if content is None:
            return []
        return [k.strip() for k in content.split(",")]

    keywords = property(_keywords)

    def links(self, external=True):
        """ Retrieves links in the page.
        Returns a list of URL's.
        By default, only external URL's are returned.
        External URL's start with http:// or https:// and point to another
        domain than the domain the page is on.
        With external=False, every href value is returned.
        """
        schemes = ("http://", "https://")
        own = ()
        if external:
            # Only needed for the external check; built once instead of per link.
            domain = URLParser(self.url).domain
            own = tuple(scheme + domain for scheme in schemes)
        links = []
        for a in self("a"):
            for attribute, value in a.attrs:
                if attribute != "href":
                    continue
                if not external:
                    links.append(value)
                elif value and value.startswith(schemes) \
                and not any(prefix in value for prefix in own):
                    # Skips value-less hrefs and URL's that mention the page's own domain.
                    links.append(value)
        return links

    def find_class(self, classname, tag=""):
        """ Searches the page for tags whose class attribute is classname.
        The search is done by calling the page with the tag name and a class filter;
        the default tag "" is passed through unchanged.
        """
        return self(tag, {"class": classname})
def parse(url, wait=10, asynchronous=False, cached=True):
    """ Creates and returns a Page for the given URL,
    passing wait, asynchronous and cached straight on to the Page constructor.
    """
    page = Page(url, wait, asynchronous, cached)
    return page
"""
import url
url = url.create("http://nodebox.net/code/index.php/Share")
url.query["p"] = 2
print url
page = parse(url)
print page.title
print page.title.string
print page.description
print page.keywords
print page.find(id="content")["id"]
# find() returns a list of Tags and has a find() method
for p in page.body.find("div", id="content").find_all("p"):
print ">>>", plain(p)
print page.links()
print page.find_all("h2")
print page.contents[0].name
# .div returns a list of Tags
print page.body.div(id="content")[0].p
"""