forked from cubika/OneCode
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDefault.aspx.vb
More file actions
170 lines (156 loc) · 6.86 KB
/
Default.aspx.vb
File metadata and controls
170 lines (156 loc) · 6.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
'********************************** 模块头 ********************************\
' 模块名: Default.aspx.vb
' 项目名: VBASPNETStripHtmlCode
' 版权 (c) Microsoft Corporation
'
' 这个页面从SourcePage.aspx中搜索完整的html代码.
' 用户可以获取和解析html代码的许多部分,像:纯文本,
' 图片,链接,脚本代码,等等.
' 这个实例代码可以用在许多应用程序中.例如:
' 搜索引擎,搜索引擎需要检查网页的简短信息,像标题,
' 纯文本,图片等等.
'
' This source is subject to the Microsoft Public License.
' See http://www.microsoft.com/opensource/licenses.mspx#Ms-PL.
' All other rights reserved.
'
' THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
' EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
' WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
'*****************************************************************************/
Imports System.Net
Imports System.IO
Public Class _Default
Inherits System.Web.UI.Page
Dim strUrl As String = [String].Empty
Dim strWholeHtml As String = String.Empty
Const MsgPageRetrieveFailed As String = "对不起,网页运行失败!"
Dim flgPageRetrieved As Boolean = True
Protected Sub Page_Load(ByVal sender As Object, ByVal e As System.EventArgs) Handles Me.Load
strUrl = Me.Page.Request.Url.ToString().Replace("Default", "SourcePage")
tbResult.Text = String.Empty
End Sub
Protected Sub btnRetrieveAll_Click(ByVal sender As Object, ByVal e As EventArgs)
strWholeHtml = Me.GetWholeHtmlCode(strUrl)
If flgPageRetrieved Then
tbResult.Text = strWholeHtml
Else
tbResult.Text = MsgPageRetrieveFailed
End If
End Sub
''' <summary>
''' 用WebRequest和WebRespond从SourcePage.aspx中检索完整的html代码
''' 我们把html代码的格式转换为uft-8.
''' </summary>
''' <param name="url"></param>
''' <returns></returns>
Public Function GetWholeHtmlCode(ByVal url As String) As String
Dim strHtml As String = String.Empty
Dim strReader As StreamReader = Nothing
Dim wrpContent As HttpWebResponse = Nothing
Try
Dim wrqContent As HttpWebRequest = DirectCast(WebRequest.Create(strUrl), HttpWebRequest)
wrqContent.Timeout = 300000
wrpContent = DirectCast(wrqContent.GetResponse(), HttpWebResponse)
If wrpContent.StatusCode <> HttpStatusCode.OK Then
flgPageRetrieved = False
strHtml = MsgPageRetrieveFailed
End If
If wrpContent IsNot Nothing Then
strReader = New StreamReader(wrpContent.GetResponseStream(), Encoding.GetEncoding("utf-8"))
strHtml = strReader.ReadToEnd()
End If
Catch e As Exception
flgPageRetrieved = False
strHtml = e.Message
Finally
If strReader IsNot Nothing Then
strReader.Close()
End If
If wrpContent IsNot Nothing Then
wrpContent.Close()
End If
End Try
Return strHtml
End Function
''' <summary>
''' 从html代码里搜索纯文本,这个纯文本只包括html的
''' Body标记.
''' </summary>
''' <param name="sender"></param>
''' <param name="e"></param>
Protected Sub btnRetrievePureText_Click(ByVal sender As Object, ByVal e As EventArgs)
strWholeHtml = Me.GetWholeHtmlCode(strUrl)
If flgPageRetrieved Then
Dim strRegexScript As String = "(?m)<body[^>]*>(\w|\W)*?</body[^>]*>"
Dim strRegex As String = "<[^>]*>"
Dim strMatchScript As String = String.Empty
Dim matchText As Match = Regex.Match(strWholeHtml, strRegexScript, RegexOptions.IgnoreCase)
strMatchScript = matchText.Groups(0).Value
Dim strPureText As String = Regex.Replace(strMatchScript, strRegex, String.Empty, RegexOptions.IgnoreCase)
tbResult.Text = strPureText
Else
tbResult.Text = MsgPageRetrieveFailed
End If
End Sub
''' <summary>
''' 从html代码中检索脚本代码.
''' </summary>
''' <param name="sender"></param>
''' <param name="e"></param>
Protected Sub btnRetrieveSriptCode_Click(ByVal sender As Object, ByVal e As EventArgs)
strWholeHtml = Me.GetWholeHtmlCode(strUrl)
If flgPageRetrieved Then
Dim strRegexScript As String = "(?m)<script[^>]*>(\w|\W)*?</script[^>]*>"
Dim strRegex As String = "<[^>]*>"
Dim strMatchScript As String = String.Empty
Dim matchList As MatchCollection = Regex.Matches(strWholeHtml, strRegexScript, RegexOptions.IgnoreCase)
Dim strbScriptList As New StringBuilder()
For Each matchSingleScript As Match In matchList
Dim strSingleScriptText As String = Regex.Replace(matchSingleScript.Value, strRegex, String.Empty, RegexOptions.IgnoreCase)
strbScriptList.Append(strSingleScriptText & vbCr & vbLf)
Next
tbResult.Text = strbScriptList.ToString()
Else
tbResult.Text = MsgPageRetrieveFailed
End If
End Sub
''' <summary>
''' 从html代码中检索图片信息.
''' </summary>
''' <param name="sender"></param>
''' <param name="e"></param>
Protected Sub btnRetrieveImage_Click(ByVal sender As Object, ByVal e As EventArgs)
strWholeHtml = Me.GetWholeHtmlCode(strUrl)
If flgPageRetrieved Then
Dim strRegexImg As String = "(?is)<img.*?>"
Dim matchList As MatchCollection = Regex.Matches(strWholeHtml, strRegexImg, RegexOptions.IgnoreCase)
Dim strbImageList As New StringBuilder()
For Each matchSingleImage As Match In matchList
strbImageList.Append(matchSingleImage.Value + vbCr & vbLf)
Next
tbResult.Text = strbImageList.ToString()
Else
tbResult.Text = MsgPageRetrieveFailed
End If
End Sub
''' <summary>
''' 从html代码中检索链接.
''' </summary>
''' <param name="sender"></param>
''' <param name="e"></param>
Protected Sub btnRetrievelink_Click(ByVal sender As Object, ByVal e As EventArgs)
strWholeHtml = Me.GetWholeHtmlCode(strUrl)
If flgPageRetrieved Then
Dim strRegexLink As String = "(?is)<a .*?>"
Dim matchList As MatchCollection = Regex.Matches(strWholeHtml, strRegexLink, RegexOptions.IgnoreCase)
Dim strbLinkList As New StringBuilder()
For Each matchSingleLink As Match In matchList
strbLinkList.Append(matchSingleLink.Value + vbCr & vbLf)
Next
tbResult.Text = strbLinkList.ToString()
Else
tbResult.Text = MsgPageRetrieveFailed
End If
End Sub
End Class