Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions web-scraping/youtube-extractor/extract_video_info.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs
import re
import json

# init session
session = HTMLSession()
Expand Down Expand Up @@ -27,22 +29,30 @@ def get_video_info(url):
result["duration"] = soup.find("span", {"class": "ytp-time-duration"}).text
# get the video tags
result["tags"] = ', '.join([ meta.attrs.get("content") for meta in soup.find_all("meta", {"property": "og:video:tag"}) ])
# number of likes
text_yt_formatted_strings = soup.find_all("yt-formatted-string", {"id": "text", "class": "ytd-toggle-button-renderer"})
result["likes"] = ''.join([ c for c in text_yt_formatted_strings[0].attrs.get("aria-label") if c.isdigit() ])
result["likes"] = 0 if result['likes'] == '' else int(result['likes'])
# number of dislikes
result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ])
result['dislikes'] = 0 if result['dislikes'] == '' else int(result['dislikes'])

# Additional video and channel information (with help from: https://stackoverflow.com/a/68262735)
data = re.search(r"var ytInitialData = ({.*?});", soup.prettify()).group(1)
data_json = json.loads(data)
videoPrimaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][0]['videoPrimaryInfoRenderer']
videoSecondaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][1]['videoSecondaryInfoRenderer']
# number of likes
likes_label = videoPrimaryInfoRenderer['videoActions']['menuRenderer']['topLevelButtons'][0]['toggleButtonRenderer']['defaultText']['accessibility']['accessibilityData']['label'] # "No likes" or "###,### likes"
likes_str = likes_label.split(' ')[0].replace(',','')
result["likes"] = '0' if likes_str == 'No' else likes_str
# number of dislikes - YouTube does not publish this anymore...?
# result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ])
# result["dislikes"] = '0' if result['dislikes'] == '' else result['dislikes']
result['dislikes'] = 'UNKNOWN'

# channel details
channel_tag = soup.find("yt-formatted-string", {"class": "ytd-channel-name"}).find("a")
channel_tag = soup.find("meta", itemprop="channelId")['content']
# channel name
channel_name = channel_tag.text
channel_name = soup.find("span", itemprop="author").next.next['content']
# channel URL
channel_url = f"https://www.youtube.com{channel_tag['href']}"
# channel_url = soup.find("span", itemprop="author").next['href']
channel_url = f"https://www.youtube.com{channel_tag}"
# number of subscribers as str
channel_subscribers = soup.find("yt-formatted-string", {"id": "owner-sub-count"}).text.strip()
channel_subscribers = videoSecondaryInfoRenderer['owner']['videoOwnerRenderer']['subscriberCountText']['accessibility']['accessibilityData']['label']
result['channel'] = {'name': channel_name, 'url': channel_url, 'subscribers': channel_subscribers}
return result

Expand Down