Mac OS Sierra
Ver10.12.1
Python 3
The page to retrieve is here.
http://www.keyakizaka46.com/s/k46o/diary/member/list?ima=0000&page=0&rw=20&cd=member&ct=02
You can move through the listing just by changing the `page` parameter in the URL.
Here's the current code.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import os.path
import re
import sys
import time
from urllib import request
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse

from bs4 import BeautifulSoup
# url function for storing destination images
def download(url):
    """Download the image at *url* and save it into the current directory.

    The file is named after the last path component of the URL
    (``os.path.basename``).  Network errors propagate to the caller.
    """
    headers = {
        # Some servers reject requests without a browser-like User-Agent,
        # so send a realistic one.  (The original code had this literal
        # broken across two lines, which is a syntax error.)
        "User-Agent": ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) "
                       "Gecko/20100101 Firefox/47.0"),
    }
    print(url)
    # Bug fix: the request object was created as `requ` but opened as `req`,
    # which raised NameError.  Use one consistent name.
    req = request.Request(url, None, headers)
    # Context managers guarantee the response and file are closed even if
    # the read or write raises.
    with request.urlopen(req) as img:
        with open(os.path.basename(url), 'wb') as localfile:
            localfile.write(img.read())
# Save shutterstock image search results
# access destination transition
# par_url = 'http://www.keyakizaka46.com/s/k46o/diary/member/list?site=k46o&ima=0000&ct=17'
# `page` is the page number; `ct` selects the member.
# def page_info(url):
#     url = url + "?ima=" + "0".zfill(4) + "&page=8&rw=20&cd=member&ct=02"
#     return url
def main():
    """Crawl member-blog list pages 0-99 and download every image hosted
    under ``www.keyakizaka46.com/images/``.

    Fixes over the original:
    - ``defmain():`` / ``if__name__`` were missing spaces (syntax errors).
    - ``group.find_all`` referenced an undefined name; the parsed document
      is ``soup``.
    - ``img_url`` is a plain string; it must go through ``urlparse`` before
      ``.netloc`` / ``.path`` can be read.
    - ``netloc`` never contains the scheme, so comparing it against
      ``"http://www.keyakizaka46.com"`` was always false; compare the host.
    - ``except HTTPError as:`` lacked the exception name, and ``URLError``
      has no ``.code`` attribute — use ``.reason``.
    - Sleep once per *page* (1 s, as the answer below recommends) instead of
      10 s per image, which is what made the original take hours.
    """
    base = ("http://www.keyakizaka46.com/s/k46o/diary/member/list"
            "?ima=0000&page={}&rw=20&cd=member&ct=02")
    for page in range(100):
        try:
            res = request.urlopen(base.format(page))
            soup = BeautifulSoup(res.read(), "html.parser")
            # Every <img> tag on the page; keep only site-hosted images.
            for link in soup.find_all('img'):
                img_url = link.get('src')
                if not img_url:
                    continue  # <img> without a src attribute
                parts = urlparse(img_url)
                if parts.netloc == "www.keyakizaka46.com" and parts.path.startswith("/images/"):
                    print(img_url)
                    download(img_url)
            # Be polite to the server: one short pause between page fetches.
            time.sleep(1)
        except HTTPError as e:
            print('Error code:', e.code)
        except URLError as e:
            print('Error reason:', e.reason)


if __name__ == '__main__':
    main()
When I started this program and left it unattended, I could only save about 400 images in about two and a half hours.
Please let me know what I can do to speed this up.
If you simply measure acceleration, time.sleep(10)
will wait 10 seconds, so if you reduce it, it will be faster.
However, it is important to access the site at appropriate intervals, so be careful: wait at least one second between requests. Ideally, leave a pause every time you access a page — hammering the server without a gap is the kind of behavior site operators complain about.
#url access
res=request.urlopen(url)
# go to a page and leave a space
time.sleep(1)
https://speakerdeck.com/amacbee/pythondezuo-ruwebkuroraru-men
© 2024 OneMinuteCode. All rights reserved.