I would like to scrape the rank and blog title from an Ameblo (Ameba blog) category ranking, page after page until there are no more pages, but the Excel output is not working well.
I'm a beginner and I'm aware that there are probably many mistakes, so please point them out.
# NOTE(review): Options, webdriver and time are used below, but this snippet
# shows no import lines for them.
options=Options()
options.add_argument('--headless')
# Create a Chrome WebDriver object (in headless mode)
# run without arguments for no-headless
browser=webdriver.Chrome(chrome_options=options)
# Open the ranking page and pause two seconds before scraping.
url="https://blogger.ameba.jp/genres/t_variety/blogs/ranking"
browser.get(url)
time.sleep(2)
def getTitle():
    # Collects the <h3> elements and the "c-iconRank__rank" elements of the
    # current page and tries to add one (rank, blog name) row per pair to df.
    title=browser.find_elements_by_tag_name("h3")
    rank=browser.find_elements_by_class_name("c-iconRank__rank")
    # Loop
    for titles, ranks in zip (title, rank):
        el=ranks.text
        # NOTE(review): `title` is the whole list of elements; the loop
        # variable holding the current element is `titles`.
        el2=title.text
        # print(el)
        # print(el2)
        # print("=====================================")
        global df
        # NOTE(review): the original indentation was lost; if this line sits
        # inside the loop, df is re-created (emptied) on every iteration.
        df = pd.DataFrame (columns = ['rank', 'blog name'])
        # NOTE(review): the brackets are unbalanced (`]]`), and the Series is
        # built without an index matching df's column names.
        df_add=pd.Series([el,el2]])
        df=df.append(df_add, ignore_index=True)
        # print(df)
def next_page():
    # Clicks the pager's "next" link, then pauses for one second.
    # go to the next page
    # NOTE(review): the selector contains a space before `--next`; confirm it
    # matches the page's actual pager markup.
    btn=browser.find_element_by_css_selector('li.c-pager__item --next>a')
    btn.click()
    time.sleep(1)
def last_page():
    # Looks up the pager's "next" link and quits the browser if the lookup
    # result is falsy.
    btn=browser.find_element_by_css_selector('li.c-pager__item --next>a')
    # NOTE(review): presumably find_element_* raises when nothing matches
    # instead of returning a falsy value, so this branch may never run — confirm.
    if not btn:
        browser.quit()
def start():
    # Scrapes up to `end` pages in sequence; the browser is always closed
    # afterwards, even when a step raises.
    end = 2
    try:
        for page in range (end):
            getTitle()
            next_page()
            last_page()
    finally:
        browser.quit()
# until the next page is gone
# def start():
#     while True:
# Run the scraper, then write the collected rows to an Excel workbook.
start()
df.to_excel('test.xlsx', sheet_name='new_sheet_name', index=False)
The main cause is that the operations on `df` were wrong — in particular, the way the arguments to `pd.Series()` were specified.
The correct form is:
df_add=pd.Series ([el,el2], index=df.columns)
The corrections are summarized as follows:
- `chrome_options=` is out of date, so it was changed to `options=`.
- `getTitle()` and `start()` had strange indentation, so it was fixed.
- `df` is now defined and initialized at global (module) scope.
- `global df` was moved to the beginning of `getTitle()` instead of being inside the `for` loop.
- The parameters passed to `pd.Series()` were corrected.
- `next_page()` and `last_page()` were merged into one function whose return value indicates whether a next page exists (that is, whether the loop should continue).

Here are the results of the fix:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Shared results table: one row per ranked blog, filled in by getTitle().
df = pd.DataFrame(columns=['rank', 'blog name'])

# Start Chrome without a visible window; drop the '--headless' argument to
# watch the browser while the script runs.
options = Options()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

# Load the first ranking page and pause two seconds before scraping.
url = "https://blogger.ameba.jp/genres/t_variety/blogs/ranking"
browser.get(url)
time.sleep(2)
def getTitle():
    """Append the rank and blog name of every entry on the current page to ``df``.

    Reads the module-level ``browser`` and rebinds the module-level ``df`` with
    the new rows placed after the existing ones. ``<h3>`` elements and
    ``c-iconRank__rank`` elements are paired in document order; if the two
    lists differ in length, the surplus elements are ignored (``zip``
    semantics). When the page yields no pairs, ``df`` is left untouched.
    """
    global df
    # "tag name" and "class name" are the locator strings behind By.TAG_NAME and
    # By.CLASS_NAME. find_elements(by, value) exists in Selenium 3 and 4, while
    # the find_elements_by_* helpers were removed in Selenium 4.3.
    # NOTE(review): assumes every <h3> on the page is a blog title — confirm.
    titles = browser.find_elements("tag name", "h3")
    ranks = browser.find_elements("class name", "c-iconRank__rank")

    # Read .text from each individual element (the list itself has no .text).
    rows = [(rank.text, title.text) for title, rank in zip(titles, ranks)]
    if not rows:
        return

    # DataFrame.append was removed in pandas 2.0, so the page's rows are
    # collected first and concatenated in a single step.
    page_df = pd.DataFrame(rows, columns=df.columns)
    df = pd.concat([df, page_df], ignore_index=True)
def next_page():
    """Go to the next ranking page, if there is one.

    Returns:
        bool: True after the pager's "next" link was clicked and the one-second
        pause has elapsed; False when no such link is found on the current
        page, in which case nothing is clicked.
    """
    # find_elements (plural) returns an empty list when nothing matches. The
    # singular find_element raises NoSuchElementException instead, so a
    # truthiness check on its result can never detect the last page.
    # NOTE(review): the selector previously read 'li.c-pager__item --next>a';
    # the space made `--next` a descendant type selector, which no HTML element
    # can match. The BEM-style class `c-pager__item--next` is assumed here —
    # confirm against the live page markup.
    links = browser.find_elements("css selector", 'li.c-pager__item--next>a')
    if not links:
        return False
    links[0].click()
    # Fixed pause so the next page can load before it is scraped.
    time.sleep(1)
    return True
def start(end=2):
    """Scrape ranking pages in sequence, then close the browser.

    Args:
        end: Maximum number of pages to scrape. The default of 2 matches the
            previously hard-coded limit. Pass ``None`` to keep going until
            ``next_page()`` reports that there is no further page.

    The module-level ``browser`` is always quit on the way out, including when
    ``getTitle()`` or ``next_page()`` raises.
    """
    scraped = 0
    try:
        while end is None or scraped < end:
            getTitle()
            scraped += 1
            # next_page() returns False once there is no further page.
            if not next_page():
                break
    finally:
        browser.quit()
# Run the scraper; start() fills the module-level df and closes the browser.
start()
# print(df)
# Write the collected rows to test.xlsx, leaving out the DataFrame index column.
df.to_excel('test.xlsx', sheet_name='new_sheet_name', index=False)
574 Who developed the "avformat-59.dll" that comes with FFmpeg?
572 rails db:create error: Could not find mysql2-0.5.4 in any of the sources
578 Understanding How to Configure Google API Key
581 PHP ssh2_scp_send fails to send files as intended
611 GDB gets version error when attempting to debug with the Presense SDK (IDE)
© 2024 OneMinuteCode. All rights reserved.