I want to download a CSV file by scraping a site.

Asked 1 year ago, Updated 1 year ago, 483 views

1. I would like to download only the November 7 vegetable price CSV file from the following site:
https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do

Anyone can download the file manually from the site itself, as shown below:

(screenshot of the site's download page)

2. The following error occurred and the download did not complete.

(screenshot of the error message)

What I tried myself: I changed

elems = browser.find_element(By.XPATH, "//a[@href]")

to the following:

elems = browser.find_elements(By.XPATH, "//a[@href]")

but an error occurred again:

(screenshot of the second error message)
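For reference, `find_element` returns a single WebElement (and raises NoSuchElementException when nothing matches), whereas `find_elements` returns a possibly empty list, which is why only the latter can be iterated. A minimal sketch of the difference, using example.com as a stand-in page:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get("https://example.com")

# find_element: first match only; raises NoSuchElementException if there is no match
first_link = browser.find_element(By.XPATH, "//a[@href]")

# find_elements: a list of all matches (an empty list if none), safe to loop over
for link in browser.find_elements(By.XPATH, "//a[@href]"):
    print(link.get_attribute("href"))

browser.quit()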

3. The code I executed is as follows:

from selenium import webdriver
import os
import time
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import urllib.request

try:
    browser = webdriver.Chrome()
    browser.get("https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do")

    # Collect every link on the page
    elems = browser.find_elements(By.XPATH, "//a[@href]")
    print(elems)
    count = 1

    for elem in elems:
        download_url = elem.get_attribute("href")
        print(download_url)
        # Save each link target to a file named after the counter
        urllib.request.urlretrieve(download_url, str(count))
        print("Number of downloaded files: " + str(count))

        time.sleep(3)
        count = count + 1

except Exception as ex:
    print(ex)

python web-scraping

2022-11-11 08:08

2 Answers

As you can see in the web browser's developer tools, each link submits a form via JavaScript (HTTP POST). With that in mind, the data can be retrieved in the following way:

import sys
import re
import requests
from bs4 import BeautifulSoup
import urllib.request

url = 'https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do'
the_day = '20221107'
r = requests.get(f'{url}?s006.dataDate={the_day}')
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, 'html.parser')

tbl = soup.select_one('div.scr1>table')
if tbl is None:
    print('not found table', file=sys.stderr)
    sys.exit(1)

# City names
cities = [c.text.strip() for c in tbl.select('td:nth-child(2)')[1:]]

# Vegetable data (CSV)
csv_url = 'https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt005.do'
for i, elm in enumerate(tbl.select('tr td:nth-child(3) a:-soup-contains("CSV")')):
    no = re.search(r",\s*'(.+?)'", elm['href']).group(1)
    urllib.request.urlretrieve(f'{csv_url}?s004.chohyoKanriNo={no}', f'{(i+1):02d}_{cities[i]}_{the_day}.csv')

After execution

$ ls -1 *.csv
01_Main Wholesale Market Total_20221107.csv
02_Sendai City_20221107.csv
03_Tokyo_20221107.csv
04_Yokohama City_20221107.csv
05_Kanazawa City_20221107.csv
06_Nagoya City_20221107.csv
07_Kyoto City_20221107.csv
08_Osaka City_20221107.csv
09_Kobe City_20221107.csv
10_Hiroshima City_20221107.csv
11_Takamatsu City_20221107.csv
12_Kitakyushu City_20221107.csv
13_Fukuoka City_20221107.csv
14_Okinawa Prefecture_20221107.csv
15_Sapporo City_20221107.csv
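For reference, the regular expression in the loop pulls the report management number (chohyoKanriNo) out of each link's JavaScript href; note that the :-soup-contains("CSV") pseudo-class requires a reasonably recent BeautifulSoup/soupsieve. A minimal sketch of the extraction, where the exact shape of the href is my assumption (only the ", '...'" argument pattern is implied by the regex):

import re

# Hypothetical href (assumption): a JavaScript call whose second quoted
# argument is the chohyoKanriNo passed to the download URL
href = "javascript:doSubmit('BS04B040UC020SC001-Evt005', '2022110700123')"
no = re.search(r",\s*'(.+?)'", href).group(1)
print(no)  # -> 2022110700123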


2022-11-11 08:11

I don't know why, but in my environment (Windows 10, Python 3.11.0) the URL mentioned in the question displays fine when opened manually in the Chrome browser, yet the page does not come up when opened through Selenium.
Using the URL that @cubick pointed out, it worked, so I tried the following.

from selenium import webdriver
import os
import time
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import urllib.request
import traceback  #### For detailed traceback display

try:
    browser = webdriver.Chrome()
    # browser.get("https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do")  ### selenium doesn't work
    browser.get("https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC998-Evt001.do")  ### URL pointed out by @cubick

    #### Click the top-left entry in the list of dates
    browser.find_element(By.XPATH, '//*[@id="s007_Evt001_0"]').click()
    time.sleep(2)

    #### Get the number of rows (including the title row) in the displayed table
    elems = len(browser.find_elements(By.XPATH, '//*[@id="content_whitemenu"]/div/table/tbody/tr/td[2]/div[2]/table/tbody/tr'))

    #### Loop from the second row to the last, skipping the title row
    for row in range(2, elems + 1):
        #### Get the city name
        cityname = browser.find_element(By.XPATH, f'//*[@id="content_whitemenu"]/div/table/tbody/tr/td[2]/div[2]/table/tbody/tr[{row}]/td[2]').text
        #### Click the vegetable CSV download link
        browser.find_element(By.XPATH, f'//*[@id="content_whitemenu"]/div/table/tbody/tr/td[2]/div[2]/table/tbody/tr[{row}]/td[3]/a[2]').click()
        print(f'download: {cityname}')
        time.sleep(3)

    browser.quit()  #### Stop the webdriver when finished
except Exception as ex:
    print(traceback.format_exc())  ### Display a detailed traceback
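
As a side note, when downloading by clicking links as above, Chrome saves the CSV files to its default download folder. If you want them to land in a specific directory without a save dialog, the download preferences can be set before starting the driver. A minimal sketch, assuming a Windows path (adjust to your environment):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {
    # Assumption: save the clicked CSV downloads here instead of the default folder
    "download.default_directory": r"C:\work\csv",
    # Skip the "Save As" prompt so each .click() completes unattended
    "download.prompt_for_download": False,
})
browser = webdriver.Chrome(options=options)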


2022-11-11 08:12


