Merged
47 changes: 14 additions & 33 deletions 08_basic_email_web_crawler.py
@@ -1,45 +1,26 @@
 import requests
 import re
-try:
-    from urllib.parse import urljoin
-except ImportError:
-    from urlparse import urljoin
 
-# regex
-email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
-link_re = re.compile(r'href="(.*?)"')
+#get url
+#url=input('Enter a URL (include 'http://'):')--this is wrong
+url = input('Enter a URL (include `http://`): ')
 
 
 def crawl(url):
+    #connect to the url
+    website=requests.get(url)
 
-    result = set()
+    #read html
+    html=website.text
 
-    req = requests.get(url)
-
-    # Check if successful
-    if(req.status_code != 200):
-        return []
+    #use re.findall to grab all the links
+    links = re.findall('"((http|ftp)s?://.*?)"', html)
 
-    # Find links
-    links = link_re.findall(req.text)
+    emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
 
-    print("\nFound {} links".format(len(links)))
-
-    # Search links for emails
-    for link in links:
+    #prints the number of links in the list
+    print("\nFound {} links".format(len(links)))
 
-        # Get an absolute URL for a link
-        link = urljoin(url, link)
-
-        # Find all emails on current page
-        result.update(email_re.findall(req.text))
-
-    return result
-
-if __name__ == '__main__':
-    emails = crawl('http://www.realpython.com')
-
-    print("\nScrapped e-mail addresses:")
-    for email in emails:
-        print(email)
-    print("\n")
+    for email in emails:
+        print(email)
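
Note on the merged result: the incoming version prompts for a URL at import time but never actually calls `crawl()`, so running the script as committed prints nothing. Below is a minimal runnable sketch of the same approach; the `__main__` guard and the final call are my assumption, not part of this commit, and the constant names are mine.

```python
import re

import requests

# Same patterns as the merged script. Note that [\w\.,]+ also accepts
# commas and stray dots, so matches may need post-filtering.
EMAIL_RE = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
# Non-capturing group so findall() returns whole URLs, not tuples.
LINK_RE = re.compile(r'"((?:http|ftp)s?://.*?)"')


def crawl(url):
    # Connect to the URL and read the HTML.
    html = requests.get(url).text

    # Grab all absolute links and all email-like strings on the page.
    links = LINK_RE.findall(html)
    emails = EMAIL_RE.findall(html)

    print("\nFound {} links".format(len(links)))
    for email in emails:
        print(email)


if __name__ == '__main__':
    # The committed version stops after defining crawl(); this call is
    # the assumed missing step.
    url = input('Enter a URL (include `http://`): ')
    crawl(url)
```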
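
For readers of the deleted hunk: the `try`/`except` import shim exists because `urljoin` lives in `urllib.parse` on Python 3 but in `urlparse` on Python 2, and the removed loop used it to resolve relative hrefs against the page they were found on. A small illustration, with URLs made up for the example:

```python
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin      # Python 2 fallback

# Resolve relative hrefs against the page they were found on.
base = 'http://www.realpython.com/blog/index.html'
print(urljoin(base, '../about.html'))  # http://www.realpython.com/about.html
print(urljoin(base, 'post-1.html'))    # http://www.realpython.com/blog/post-1.html
```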