from bs4 import Beautiful Soup
from urllib.request import urlopen
from html_table_extractor.extractor import Extractor
from selenium import webdriver
import sys
# # -*- encoding: utf-8 -*-
driver = webdriver.Firefox(executable_path='C:/Users/i/Downloads/geckodriver-v0.19.1-win32/geckodriver')
driver.implicitly_wait(1)
driver.get('http://terms.naver.com/list.nhn?cid=58401&categoryId=58401&so=st4.asc&viewType=&categoryType=')
xpath = '//*[@id="content"]/div[4]/ul/ul/li[5]/ul/li['
xpath_bottom = ']/a'
index = 2
while (index <= 26) :
driver.find_element_by_xpath(xpath + str(index) + xpath_bottom).click()
html = driver.page_source.encode('cp949', errors='replace')
soup = BeautifulSoup(html, 'html.parser')
table = soup.select("#size_ct > div.box_tbl > table")
title1 = soup.select("#content > div.section_wrap > div.headword_title > h2")
title2 = str(title1).replace("[", "").replace("]", "").replace("<", "").replace(">", "").replace("/", "").replace("h2", "").replace("class", "").replace("=", "").replace("headword", "").replace("\"", "").lstrip()
stringTable = str(table)
extractor = Extractor(stringTable).parse()
extractor.write_to_csv(title2, path='.')
driver.get('http://terms.naver.com/list.nhn?cid=58401&categoryId=58401&so=st4.asc&viewType=&categoryType=')
index += 1
I'm currently using Python to crawl the web UnicodeEncodeError: 'cp949' codec can't encode character '\xa0' in position 26: illegal multibyte sequence I keep getting encoding errors like this ㅜ<
python crawling encoding error
Python 3.6 (INTEL WIN32) and the browser for selenium used phantomjs.
Output.csv was generated and disease-related information was recorded.
I mean, it worked out well.
607 Uncaught (inpromise) Error on Electron: An object could not be cloned
577 PHP ssh2_scp_send fails to send files as intended
572 Understanding How to Configure Google API Key
567 Who developed the "avformat-59.dll" that comes with FFmpeg?
567 rails db:create error: Could not find mysql2-0.5.4 in any of the sources
© 2024 OneMinuteCode. All rights reserved.