I want to output the scraping results to Excel.

I would like to scrape the rank and title of each blog from Ameblo's category ranking, going page by page until there are no more pages, but the Excel output is not working well.
I'm a beginner, so I expect there are many mistakes; please point them out.

 # Build Chrome options with the --headless flag.
 options=Options()
options.add_argument('--headless')

# Create a Chrome WebDriver object (in headless mode).
# Omit the options argument to run with a visible (non-headless) browser.
# NOTE(review): `Options`, `webdriver` and `time` are used here, but no import
# statements appear in this snippet.
browser=webdriver.Chrome(chrome_options=options)
# Open the ranking page, then pause 2 seconds before anything else runs.
url="https://blogger.ameba.jp/genres/t_variety/blogs/ranking"
browser.get(url)
time.sleep(2)

def getTitle():
# Scrape the current page: collect every <h3> element and every element with
# class "c-iconRank__rank", then add their text to the global DataFrame `df`.
# NOTE(review): the indentation of this function body is missing as posted.
title=browser.find_elements_by_tag_name("h3")
rank=browser.find_elements_by_class_name("c-iconRank__rank")
# Loop over both element lists in lockstep (zip stops at the shorter list).

for titles, ranks in zip (title, rank):

el=ranks.text
# NOTE(review): `title` is the list returned by find_elements_by_tag_name;
# the loop variable holding the current element is `titles`.
el2=title.text
# print(el)
# print(el2)
# print("=====================================")
global df
# NOTE(review): this rebinds `df` to a new, empty DataFrame right before the
# row is appended; if it runs on every iteration, earlier rows are discarded.
df = pd.DataFrame (columns = ['rank', 'blog name'])
# NOTE(review): the brackets on the next line are unbalanced (one extra `]`).
df_add=pd.Series([el,el2]])
df=df.append(df_add, ignore_index=True)
# print(df)

def next_page():
# Go to the next page: look up the pager's "next" link, click it, and then
# pause one second.
  # NOTE(review): the selector contains a space before `--next`, so `--next`
  # is parsed as a separate descendant selector, not as part of the class name.
  btn=browser.find_element_by_css_selector('li.c-pager__item --next>a')
  btn.click()
  time.sleep(1)

def last_page():
  # Look up the same "next" link selector as next_page() and quit the browser
  # when the lookup result is falsy.
  # NOTE(review): Selenium's find_element_* methods raise
  # NoSuchElementException when nothing matches instead of returning a falsy
  # value, so the `if not btn` branch is presumably never taken - confirm.
  btn=browser.find_element_by_css_selector('li.c-pager__item --next>a')
  if not btn:
    browser.quit()

def start():
  # Drive the scrape: call getTitle() up to `end` times, move to the next
  # page, and always quit the browser at the end.
  end = 2
  try:
    for page in range (end):
      getTitle()
  # NOTE(review): as posted, the two calls below and the `finally:` clause are
  # not indented to match the `for` / `try:` above.
  next_page()
  last_page()
finally:
  browser.quit()

# until the next page is gone
US>"def start():
  while True: "

# Run the scrape, then write the global DataFrame `df` to test.xlsx
# (sheet "new_sheet_name", without the index column).
start()

df.to_excel('test.xlsx', sheet_name='new_sheet_name', index=False)

python python3 web-scraping selenium

2022-09-30 13:57

1 Answer

The main cause is that the operations on df were wrong.
In particular, the arguments passed to pd.Series() were specified incorrectly. The correct form is:

df_add=pd.Series ([el,el2], index=df.columns)

The corrections are summarized as follows:

The posted source has no import statements, so add them.
chrome_options= is deprecated, so change it to options=.
Fix the broken indentation in getTitle() and start().
Define and initialize df at global (module) scope.
Move global df to the beginning of getTitle() instead of inside the for loop.
Fix the pd.Series() arguments as shown above.
Merge next_page() and last_page() into a single function whose return value indicates whether a next page exists (that is, whether the loop should continue).

Here are the results of the fix:

import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Accumulates one row per scraped blog entry; getTitle() adds rows to it.
df = pd.DataFrame(columns=['rank', 'blog name'])

# Headless Chrome: drop the `options` argument to run with a visible window.
options = Options()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

# Load the first ranking page and pause 2 seconds before scraping starts.
url = "https://blogger.ameba.jp/genres/t_variety/blogs/ranking"
browser.get(url)
time.sleep(2)

def getTitle():
  """Append the rank and title text of every entry on the current page to `df`.

  Reads the page currently loaded in the module-level `browser` and rebinds
  the module-level `df` to a DataFrame that also contains the new rows.
  Does nothing when no (title, rank) pair is found on the page.
  """
  global df
  # Function-scope import: the find_elements_by_* helpers were removed in
  # Selenium 4.3, so elements are located with find_elements(By.<kind>, value).
  from selenium.webdriver.common.by import By

  titles = browser.find_elements(By.TAG_NAME, "h3")
  ranks = browser.find_elements(By.CLASS_NAME, "c-iconRank__rank")

  # zip() pairs the n-th <h3> with the n-th rank element and stops at the
  # shorter list. Each row is a Series labelled with df's column names so the
  # values line up with the 'rank' and 'blog name' columns.
  rows = [
    pd.Series([rank.text, title.text], index=df.columns)
    for title, rank in zip(titles, ranks)
  ]
  if not rows:
    return

  # DataFrame.append() was removed in pandas 2.0; concatenate once per page.
  df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

def next_page():
  """Click through to the next ranking page, if the pager offers one.

  Returns:
    True if a "next" link was found and clicked, False if the current page
    has no such link (i.e. there is no further page to visit).
  """
  from selenium.webdriver.common.by import By

  # find_elements() (plural) returns an empty list when nothing matches,
  # whereas find_element() raises NoSuchElementException - so the previous
  # `if not btn` check could never report the last page.
  # NOTE(review): the selector assumes the pager's modifier class is
  # `c-pager__item--next`. The earlier text had a space before `--next`, which
  # makes `--next` a separate descendant selector instead of part of the class
  # name. Confirm against the live page markup.
  links = browser.find_elements(By.CSS_SELECTOR, 'li.c-pager__item--next>a')
  if not links:
    return False

  links[0].click()
  time.sleep(1)  # fixed pause after the click, presumably to let the page load
  return True

def start(end=2):
  """Scrape ranking pages one after another, then shut the browser down.

  Each round calls getTitle() for the current page and then next_page();
  the loop stops as soon as next_page() reports that no next page exists.

  Args:
    end: Maximum number of pages to scrape. Defaults to 2, the limit that
      used to be hard-coded. Pass None to keep going until there is no
      next page.
  """
  scraped = 0
  try:
    while end is None or scraped < end:
      getTitle()
      scraped += 1
      if not next_page():
        break
  finally:
    # Always release the WebDriver, even if scraping raised.
    browser.quit()

# Scrape the ranking pages, then export the collected rows to an Excel file.
start()
df.to_excel(
  'test.xlsx',
  sheet_name='new_sheet_name',
  index=False,
)

2022-09-30 13:57

If you have any answers or tips

Popular Tags

python x 4647

android x 1593

java x 1494

javascript x 1427

c x 927

c++ x 878

ruby-on-rails x 696

php x 692

python3 x 685

html x 656