Hi, everyone. I've been studying web crawling. Every site has a different page structure — and each Naver cafe is different too — so when I took an example and tried to apply it to another cafe, I ran into an error. Here's the code.
# --- Setup: imports, Chrome driver, output CSV, Naver login ---
import csv
import time

from selenium import webdriver
from bs4 import BeautifulSoup as bs

driver = webdriver.Chrome(executable_path="/Users/GB-OFC-006/Downloads/python/chromedriver.exe")
driver.implicitly_wait(3)

# CSV header row: category tag ("horse head"), post title, post body.
total_list = ["Horse Head", "Title", "Content"]
# NOTE(review): encoding='ansi' only exists on Windows Python — use 'cp949'
# or 'utf-8-sig' for a portable script; kept as-is to preserve behavior.
f = open('preg_quest.csv', 'w', encoding='ansi', newline='')
wr = csv.writer(f)
wr.writerow([total_list[0], total_list[1], total_list[2]])
f.close()

# Log in to Naver (required to read cafe posts).
driver.get('https://nid.naver.com/nidlogin.login')
driver.find_element_by_name('id').send_keys('gustj258')
driver.find_element_by_name('pw').send_keys('****')
driver.find_element_by_css_selector('#frmNIDLogin > fieldset > input').click()
# BUG FIX: the original called Time.sleep(15) — 'Time' is undefined (NameError).
# Pause so the automatic-input-protection (CAPTCHA) characters can be typed by hand.
time.sleep(15)
driver.find_element_by_css_selector('#frmNIDLogin > fieldset > input').click()
# --- Crawl: walk list pages, collect post URLs, scrape each post to CSV ---
base_url = 'https://cafe.naver.com/onlysealure/'
cnt = 0   # number of posts collected so far
page = 0  # current list-page index

while page < 1000:  # Naver cafe shows at most 1000 list pages
    page = page + 1
    quest_urls = []
    try:
        # Query-string parameters:
        #   search.menuid  = bulletin-board id (differs for each cafe)
        #   search.page    = list page to fetch
        #   userDisplay=50 = posts displayed per page
        driver.get(base_url + '&search.menuid=392&search.page=' + str(page) + '&userDisplay=50')
        # The post list is rendered inside the 'cafe_main' iframe.
        driver.switch_to.frame('cafe_main')
        # NOTE(review): this selector differs from cafe to cafe — verify it in the DOM.
        quest_list = driver.find_elements_by_css_selector('div.inner_list > a.article')
        quest_urls = [i.get_attribute('href') for i in quest_list]
        print(len(quest_urls))
        for quest in quest_urls:
            # A post may have been deleted (or lack a category tag),
            # so guard each post individually and skip it on failure.
            try:
                driver.get(quest)
                driver.switch_to.frame('cafe_main')
                soup = bs(driver.page_source, 'html.parser')
                # Title extraction
                title = soup.select('div.tit-box span.b')[0].get_text()
                # Content extraction: join all <p> tags inside #tbody
                content_tags = soup.select('#tbody')[0].select('p')
                content = ' '.join(tags.get_text() for tags in content_tags)
                # Category-tag ("horse head") extraction; some posts have
                # no tag, in which case select(...)[0] raises IndexError.
                tag = soup.select('div.tit-box span.head')[0].get_text()
                temp_list = [tag, title, content]
                # BUG FIX: use a context manager so the file is closed even
                # if writerow raises (original opened/closed by hand).
                with open('preg_quest.csv', 'a+', encoding='ansi', newline='') as f:
                    csv.writer(f).writerow(temp_list)
                cnt = cnt + 1
            except Exception:  # deleted post or missing tag -> next post
                pass
    except Exception:
        # A Chrome alert can block navigation; accept it and continue.
        # BUG FIX: original had 'driver.switch_to_alert.accpet()' — both a
        # typo ('accpet') and the deprecated switch_to_alert attribute.
        try:
            driver.switch_to.alert.accept()
        except Exception:
            pass
    print([page, cnt])  # progress: current page, total posts collected
The original example targeted a maternity cafe; I adapted it to a sea-fishing cafe and ran it.
No data was collected — the output CSV came out empty. :(
May I know why?
Help me Coding masters!
Source: https://sueaty.tistory.com/36 I never use it commercially.
crawling python
driver.find_element_by_css_selector('#frmNIDLogin > fieldset > input').click()
I didn't check the DOM in your environment, but the automatic-input-protection (CAPTCHA) prompt normally only appears after several failed login attempts.
So the error you're seeing most likely means there is no element matching `#frmNIDLogin > fieldset > input` on the page at that moment.
Even after fixing that, I don't think the script will run smoothly right away...
I recommend learning a bit of HTML/CSS first and working through it slowly — and please crawl in accordance with the site's robots.txt rules.
© 2024 OneMinuteCode. All rights reserved.