I would like to extract the following two elements from the HTML source of the page at a given URL.
However, the resulting list always comes back empty.
def get_all_reviews(url):
    """Download ``url`` and collect the text of the rating and review tags.

    This is the version from the question. As written, the ``class``
    comparisons below never match (see the note inside the loop), so the
    returned list is always empty.

    Args:
        url: Address of the review page to download.

    Returns:
        The list of stripped text strings gathered from matching
        ``<dd>``/``<p>`` tags.
    """
    rvw_list_text = []
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text, features='lxml')
    for r in soup.find_all(['dd', 'p']):
        # NOTE: BeautifulSoup returns the multi-valued ``class`` attribute as
        # a list of strings (or None when the tag has no class), so comparing
        # it with ``==`` against a plain string is never true. This is why
        # the result list stays empty.
        if r.name == 'dd' and r.get('class') == 'rpoint-tx-point':
            rvw_list_text.append(r.get_text(strip=True))
        elif r.name == 'p' and r.get('class') == 'review-tit-article':
            rvw_list_text.append(r.get_text(strip=True))
    print(rvw_list_text)
    # The caller assigns the result of this function, so hand the list back.
    return rvw_list_text
url = "https://www.xxxx"
rvw_list_text = get_all_reviews(url)

# Keep only the reviews that contain the search word, wrapped to 80 columns.
# get_all_reviews() collects plain strings (the results of get_text()), so
# each item is used directly instead of through a ``.text`` attribute, which
# str objects do not have.
# ``key_word`` is expected to be defined elsewhere in the script.
reviews_text = [
    textwrap.fill(review, 80)
    for review in rvw_list_text
    if key_word in review  # If the search word is included
]
These are the elements I want to extract:
<dd class="rpoint-tx-point">4.50 points</dd>
<p class="review-tx-article">
<span class="review-tit-article">Curriculum</span>5 course is good, but since I'm doing club activities, I think three and a half hours of classes after club activities on Saturday will be a bit long and difficult.</p>
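Possibility 1: The page uses JavaScript to render its content, so the HTML returned by a plain HTTP request does not contain the reviews.
Possibility 2: When bs4 reads an element's class attribute, it returns it as a list rather than a string, so comparing it to a string with == never matches.
Considering both possibilities, I rewrote the code using Selenium.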
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import bs4
# Configure a headless Chrome session used for fetching the rendered page.
chrome_options = Options()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--headless")  # run without opening a window
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
# The next two flags are commonly needed when Chrome runs inside a container
# or as root; drop them if your environment does not require them.
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")

# Pass the options through ``options=``: the older ``chrome_options=`` keyword
# was deprecated and is no longer accepted by current Selenium 4 releases.
driver = webdriver.Chrome(options=chrome_options)
def get_all_reviews(url):
    """Load ``url`` in the module-level Selenium driver and collect review texts.

    The page is rendered by the browser first, so content inserted by
    JavaScript is present in ``driver.page_source`` before it is parsed.

    Args:
        url: Address of the review page to open.

    Returns:
        A list with the stripped text of every ``<dd>`` or ``<p>`` tag whose
        class list contains ``rpoint-tx-point`` or ``review-tx-article``,
        in document order.
    """
    wanted_classes = ('rpoint-tx-point', 'review-tx-article')
    rvw_list_text = []
    driver.get(url)
    soup = bs4.BeautifulSoup(driver.page_source, features="html.parser")
    for r in soup.find_all(['dd', 'p']):
        # ``class`` is a multi-valued attribute, so bs4 exposes it as a list
        # of class names; tags without a class yield the empty default here.
        n_cls = r.get("class", [])
        if any(name in n_cls for name in wanted_classes):
            rvw_list_text.append(r.get_text(strip=True))
    return rvw_list_text
url = "http://xxx"
try:
    rvw_list_text = get_all_reviews(url)
    print(rvw_list_text)
finally:
    # Always shut the browser down, even if scraping fails; otherwise the
    # headless Chrome process started above keeps running in the background.
    driver.quit()
To use Chrome with Selenium, you must install Chrome and then make the chromedriver executable that matches your Chrome version available on your PATH.
https://sites.google.com/a/chromium.org/chromedriver/home
© 2024 OneMinuteCode. All rights reserved.