I want to crawl the image URLs of books on the Yes24 site and save the images.
While crawling, how can I download the image in both cases, when the img src contains .jpg and when it doesn't?
With the code below, files whose URL does not contain .jpg download fine, but as soon as a URL with .jpg appears, an error occurs and the run stops.
import urllib.request

# soup is the parsed product page; b_img_url is a list defined earlier
book_img = soup.find_all('em', {'class': 'imgBdr'})
img_url = book_img[0].find('img')['src']
print(img_url)
b_img_url.append(img_url)  # find the image URL and add it to the list
if '.jpg' in str(img_url):
    img = img_url
else:
    img = img_url + '.jpg'
img_name = book_img[0].find('img')['alt']
# strip characters that are not allowed in file names, then save the image
urllib.request.urlretrieve(img, "yes24_2018,2019/" + img_name.replace("/", ",").replace('"', "'").replace(":", "-").replace(">", ")").replace("<", "(").replace("?", "").strip() + '.jpg')
The error is as follows:
urllib.error.HTTPError: HTTP Error 404: Not Found
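
As an aside, instead of guessing from the URL string you can ask the server what it is actually sending back. The sketch below only illustrates that idea; needs_jpg_suffix is a made-up helper, and it assumes the image server sets the Content-Type header correctly:

import requests

def needs_jpg_suffix(url):
    # image/jpeg is the standard MIME type for JPEG images,
    # so check the response header instead of the URL string
    resp = requests.get(url)
    is_jpeg = 'image/jpeg' in resp.headers.get('Content-Type', '')
    return is_jpeg and not url.endswith('.jpg')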
Here is an approach that downloads the image bytes first and decides from the content itself rather than from the URL string:

import requests
from bs4 import BeautifulSoup as bs

def filesave(url):
    try:
        urlsplit = url.split('/')[-1]
        name = 'C:/Users/User/hi/' + urlsplit
        bn = requests.get(url).content
        # every JPEG file starts with the bytes FF D8 FF
        if bn[0:3] != b'\xff\xd8\xff':
            print('this file is not JPEG file format')
            return 0
        else:
            # append the extension only when the URL's last segment lacks it
            if 'jpg' not in urlsplit:
                name += '.jpg'
            f = open(name, 'wb')
            f.write(bn)
            f.close()
            print(f'[!] {name} saved')
            return name
    except Exception as e:
        print(e)
        return 0

def imgsrc(url):
    # parse the product page and pull the cover image URL out of gd_imgArea
    s = bs(requests.get(url).text, 'html.parser')
    img = s.find('div', {'class': 'gd_imgArea'})
    if img is not None:
        return img.span.em.img['src']
    else:
        return None

url1 = 'http://www.yes24.com/Product/Goods/58397337'
url2 = 'http://www.yes24.com/Product/Goods/58412700'
filesave(imgsrc(url1))
filesave(imgsrc(url2))
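
The bn[0:3] != b'\xff\xd8\xff' comparison works because every JPEG file begins with those three bytes (the FF D8 start-of-image marker followed by the FF that opens the next marker segment), so the check does not depend on how the URL is spelled.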
Keep in mind that you can skip the file-signature check if you don't need it.
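
If you do skip it, a minimal sketch of the same function without the signature check (same assumed save path as above) could look like this:

import requests

def filesave_simple(url):
    # same as filesave above, minus the JPEG signature check
    name = 'C:/Users/User/hi/' + url.split('/')[-1]
    if 'jpg' not in name:
        name += '.jpg'
    with open(name, 'wb') as f:
        f.write(requests.get(url).content)
    return name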