I am running YouTube comment crawling code in Python. I want to scroll down through the comments and collect all of them, but the scrolling does not work properly, so only about 20 comments are collected. Also, sometimes the script runs successfully and sometimes it does not run at all; is there something in the code that causes this? I'd appreciate any help.
num_of_end = 4
while num_of_end:
    body.send_keys(Keys.END)
    time.sleep(2)
    num_of_end -= 1
This is the code for scrolling down. I'm also uploading the whole code in case it helps.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import string
import time
driver = webdriver.Chrome("C:\\Users\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Python 3.8\\chromedriver")
# URL of the page to load
driver.get('https://www.youtube.com/watch?v=jv1b2c3EYb4')
time.sleep(1)
#Mouse cursor positioning. Position the mouse cursor on the body for scrolling
body = driver.find_element_by_tag_name("body")
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
# Import Title
title = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string').text
print("Video Title: " + title + '\n\n')
# Position scroll bar at bottom to collect full comments
#Specify the number of times you want to drop the page
num_of_end = 4
while num_of_end:
    body.send_keys(Keys.END)
    time.sleep(2)
    num_of_end -= 1
# Comments (extracted by html tag)
comment = soup.find_all('yt-formatted-string', {'id':'content-text'})
comment_list = []
for c in comment:
    comment_list.append(c.get_text().strip())
# Comment author id
user_id = soup.find_all('a', {'id':'author-text'})
id_list = []
for u in user_id:
    id_list.append(u.get_text().strip())
# Number of likes on each comment
like = soup.find_all('span', {'id':'vote-count-left'})
like_list_bf = []
like_list = []
for l in like:
    like_list_bf.append(l.get_text().strip())
#Replace empty value with zero
for bf in like_list_bf:
    if bf == '':
        like_list.append(0)
    else:
        like_list.append(bf)
# Check the list of imported comments
for c in comment_list:
    print(c + '\n')
# Verify that the size is constant to create a dataframe
print(len(comment_list))
print(len(id_list))
print(len(like_list))
# Pre-processing to prevent breakage when saving as csv (remove emoji-unicode, Arabic characters, etc.)
s_filter = re.compile("[^"
                      "a-zA-Z"  # English
                      "가-힣ㄱ-ㅎㅏ-ㅣ"  # Korean
                      "0-9"  # Number
                      "\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\=\(\'\""  # Special symbols
                      " "  # Space
                      "]+")
# Pre-processing of comments
comment_result = []
for i in comment_list:
    i = re.sub(s_filter, "", i)
    i = ''.join(i)
    comment_result.append(i)
# IDs also need preprocessing because they can contain Arabic characters, etc.
id_result = []
for i in id_list:
    i = re.sub(s_filter, "", i)
    i = ''.join(i)
    id_result.append(i)
# Create each imported data into a single data frame
DB = pd.DataFrame({'id' : id_result,'comment' : comment_result,'like' : like_list})
# Add comment length for analysis
DB['text_length'] = DB['comment'].apply(len)
# Check DB
DB.head()
# Export to a csv file
DB.to_csv("Dataset Raw27.csv",encoding="euc-kr")
"This is the code for scrolling down."
That assumption is wrong: there is no scrolling method in the code you wrote that actually works for YouTube's comment section.
"# Position the scroll bar at the bottom to collect full comments"
On a page like YouTube, where additional comments are loaded only as you scroll toward the bottom, you cannot rely on a static approach that simply presses the END key a fixed number of times (4 here) in a loop and assumes everything has been loaded.
Try an algorithm like the following instead:
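One more issue in the posted code: driver.page_source is read into BeautifulSoup before the scrolling loop runs, so the soup only ever contains the roughly 20 comments that are loaded initially. Below is a minimal sketch of a dynamic scrolling approach, assuming the same driver instance created in your code; it keeps scrolling until the page height stops growing and only then re-parses page_source. It is not tested against the current YouTube layout, and the 'content-text' selector is simply taken from your code, so it may break if YouTube changes its markup.

import time
from bs4 import BeautifulSoup
# `driver` is assumed to be the webdriver.Chrome instance created above.

SCROLL_PAUSE = 2  # seconds to wait for new comments to load after each scroll

last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    # Scroll the document to its full height so YouTube loads the next batch of comments
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(SCROLL_PAUSE)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        # The height did not change, so no new comments were loaded
        break
    last_height = new_height

# Re-read the page source only AFTER scrolling has finished
soup = BeautifulSoup(driver.page_source, "html.parser")
comment_list = [c.get_text().strip()
                for c in soup.find_all('yt-formatted-string', {'id': 'content-text'})]
print(len(comment_list))

If you are worried about videos with a very large number of comments, you can also cap the loop at a maximum number of scrolls instead of waiting for the height to stop changing.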