-
-
Notifications
You must be signed in to change notification settings - Fork 585
Open
Description
import cloudscraper
from bs4 import BeautifulSoup
url = 'https://www.tripadvisor.co.uk/Hotel_Feature-g297414-d505948-zft9166-Shantou_International_Hotel.html%0A'
# NOTE(review): the URL ends with "%0A", a percent-encoded line feed. This looks
# like a copy/paste artifact; confirm whether it is meant to be part of the path.

# Reproduction script: fetch the page through cloudscraper, drop <script> and
# <style> elements, and print the remaining visible text.

# Use cloudscraper to bypass Cloudflare protection
response = None  # stays None if the request fails before any response arrives
try:
    scraper = cloudscraper.create_scraper()
    response = scraper.get(url, timeout=30)
    print(f"Status code: {response.status_code}")
    response.raise_for_status()
except Exception as e:
    # Deliberately do not exit on an HTTP error status: the body of the error
    # response (e.g. the block page) is still parsed and printed below.
    print(f"Error: {e}")

if response is None:
    # The request itself failed (timeout, connection error, ...), so there is
    # no body to inspect; stop instead of hitting a NameError further down.
    raise SystemExit(1)

if 'text/html' not in response.headers.get('content-type', ''):
    # Really skip non-HTML payloads rather than feeding them to the parser.
    print(f"Skipped non-HTML content: {url}")
else:
    html = response.text
    soup = BeautifulSoup(html, "lxml")
    # Remove elements whose text is code/styling, not page content.
    for tag in soup(["script", "style"]):
        tag.decompose()
    raw_content = soup.get_text(separator='\n', strip=True)
    print(raw_content)
Output:
Status code: 403
Error: 403 Client Error: Forbidden for url: https://www.tripadvisor.co.uk/Hotel_Feature-g297414-d505948-zft9166-Shantou_International_Hotel.html%0A
tripadvisor.co.uk
Please enable JS and disable any ad blocker
Metadata
Metadata
Assignees
Labels
No labels