1. I would like to download only the November 7 vegetable price csv file from the following site.
https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do
Anyone can access the site itself in a browser, as shown below:
2. The following error occurred and the download was not completed.
Attempted self-fix:
elems=browser.find_element(By.XPATH, "//a[@href]")
Change to the following
elems=browser.find_elements(By.XPATH, "//a[@href]")
Error again
3. The following execution code
from selenium import webdriver
import os
import time
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import urllib.request

try:
    browser = webdriver.Chrome()
    browser.get("https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do")
    # find_elements (plural) returns a list of all matching anchors;
    # find_element (singular) would return only the first match.
    elems = browser.find_elements(By.XPATH, "//a[@href]")
    print(elems)
    count = 1
    for elem in elems:
        download_url = elem.get_attribute("href")
        print(download_url)
        # Save each link target under a sequential numeric file name.
        urllib.request.urlretrieve(download_url, str(count))
        print("Number of download files:" + str(count))
        time.sleep(3)  # be polite to the server between downloads
        count = count + 1
except Exception as ex:
    print(ex)
As you can see in the browser's Developer Tools, each "link" is actually a form submit performed via JavaScript (HTTP POST); with that in mind, the data can be retrieved as follows:
import sys
import re
import requests
from bs4 import BeautifulSoup
import urllib.request

url = 'https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do'
the_day = '20221107'  # target date: November 7, 2022

# Request the price page for the given date (the date is passed as a query parameter).
r = requests.get(f'{url}?s006.dataDate={the_day}')
r.encoding = r.apparent_encoding  # let requests guess the (non-UTF-8) page encoding
soup = BeautifulSoup(r.text, 'html.parser')

tbl = soup.select_one('div.scr1>table')
if tbl is None:
    print('not found table', file=sys.stderr)
    sys.exit(1)

# city name (skip the header cell with [1:])
cities = [c.text.strip() for c in tbl.select('td:nth-child(2)')[1:]]

# Vegetable Data (CSV)
csv_url = 'https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt005.do'
for i, elm in enumerate(tbl.select('tr td:nth-child(3) a:-soup-contains("CSV")')):
    # The report id is the second quoted argument embedded in the JavaScript href.
    no = re.search(r",\s*'(.+?)'", elm['href']).group(1)
    urllib.request.urlretrieve(f'{csv_url}?s004.chohyoKanriNo={no}',
                               f'{(i+1):02d}_{cities[i]}_{the_day}.csv')
After execution
$ ls -1 *.csv
01_Main Wholesale Market Total_20221107.csv
02_Sendai City_20221107.csv
03_Tokyo_20221107.csv
04_Yokohama City_20221107.csv
05_Kanazawa City_20221107.csv
06_Nagoya City_20221107.csv
07_Kyoto City_20221107.csv
08_Osaka City_20221107.csv
09_Kobe City_20221107.csv
10_Hiroshima City_20221107.csv
11_Takamatsu City_20221107.csv
12_Kitakyushu City_20221107.csv
13_Fukuoka City_20221107.csv
14_Okinawa Prefecture_20221107.csv
15_ Sapporo City_20221107.csv
I don't know why, but the URL mentioned in the question displays fine when opened manually in the Chrome browser, yet it does not load when opened via Selenium in my environment (Windows 10, Python 3.11.0).
Following the URL that @cubick-san pointed out, I tried the following:
from selenium import webdriver
import os
import time
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import urllib.request
import traceback  # for detailed traceback display

try:
    browser = webdriver.Chrome()
    # browser.get("https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC001-Evt001.do")  # selenium doesn't work
    browser.get("https://www.seisen.maff.go.jp/seisen/bs04b040md001/BS04B040UC020SC998-Evt001.do")  # URL pointed out by @cubick
    # Click the top-left entry of the date list.
    browser.find_element(By.XPATH, '//*[@id="s007_Evt001_0"]').click()
    time.sleep(2)
    # Count the rows (including the title row) of the displayed table.
    # NOTE: By must be capitalized — 'by.XPATH' raises NameError.
    elems = len(browser.find_elements(By.XPATH, '//*[@id="content_whitemenu"]/div/table/tbody/tr/td[2]/div[2]/table/tbody/tr'))
    # Repeat from the second row to the last row, skipping the title row.
    for row in range(2, elems + 1):
        # city name acquisition
        cityname = browser.find_element(By.XPATH, f'//*[@id="content_whitemenu"]/div/table/tbody/tr/td[2]/div[2]/table/tbody/tr[{row}]/td[2]').text
        # Click the vegetable CSV download link.
        browser.find_element(By.XPATH, f'//*[@id="content_whitemenu"]/div/table/tbody/tr/td[2]/div[2]/table/tbody/tr[{row}]/td[3]/a[2]').click()
        print(f'download: {cityname}')
        time.sleep(3)  # allow the download to complete before the next click
    browser.quit()  # stop webdriver when finished
except Exception as ex:
    print(traceback.format_exc())  # view detailed traceback
© 2024 OneMinuteCode. All rights reserved.