I would like to do web scraping in Python and print the search words and results to Excel.
We envision the form below as an example of the desired Excel output.
But when I run the current Python program,
the result looks like this, and the output is not as expected.
The code is below — please let me know if there is a solution.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from urllib import request
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import openpyxl as op
import datetime
import time
def change_window(browser):
    """Switch *browser* (a Selenium WebDriver) to the one newly opened window.

    Assumes exactly one window handle exists besides the current one;
    raises AssertionError otherwise.
    """
    # BUG FIX: the pasted source had mangled whitespace ("defchange_window",
    # "assertlen") that made this block a SyntaxError.
    all_handles = set(browser.window_handles)
    switch_to = all_handles - {browser.current_window_handle}
    assert len(switch_to) == 1
    browser.switch_to.window(*switch_to)
def main():
    """Look up each search word from 'general name.xlsx' (column A) on the
    PMDA device-search site and record (timestamp, PDF link, word) into
    columns A/B/C of row i in 'URL_DATA.xlsx'.
    """
    for i in range(1, 9):
        wb = op.load_workbook('general name.xlsx')
        ws = wb.active
        word = ws['A' + str(i)].value
        # NOTE(review): Selenium 4 removed the positional driver-path argument;
        # if this raises, wrap the path in selenium's Service(...) — confirm
        # the installed Selenium version.
        driver = webdriver.Chrome(r'C:\/chromedriver.exe')
        driver.get("https://www.pmda.go.jp/PmdaSearch/kikiSearch/")
        # BUG FIX: find_element_by_id/_name were removed in Selenium 4;
        # use the By-based locator API (By is already imported at the top).
        elem_search_word = driver.find_element(By.ID, "txtName")
        elem_search_word.send_keys(word)
        elem_search_btn = driver.find_element(By.NAME, 'btnA')
        elem_search_btn.click()
        # The results open in a new window; move the driver there.
        change_window(driver)
        cur_url = driver.current_url
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        has_pdf_link = False
        print(word)
        wb = op.load_workbook('URL_DATA.xlsx')
        ws = wb.active
        ws['C' + str(i)].value = word
        # BUG FIX: the original iterated over an undefined name 'group';
        # iterate over the parsed page (soup) instead.  Also removed an
        # unbalanced extra ')' on the urljoin line.
        for a_tag in soup.find_all('a'):
            link_pdf = urljoin(cur_url, a_tag.get('href'))
            # Keep only links ending in .pdf or routed via /ResultDataSetPDF/.
            if (not link_pdf.lower().endswith('.pdf')) and ('/ResultDataSetPDF/' not in link_pdf):
                continue
            if 'searchhelp' not in link_pdf:
                has_pdf_link = True
                print(link_pdf)
                # NOTE: only one cell per word — later links overwrite earlier
                # ones (the second version of main() below handles n links).
                ws['B' + str(i)].value = link_pdf
        if not has_pdf_link:
            print('False')
            ws['B' + str(i)].value = has_pdf_link
        time.sleep(2)  # be gentle on the server between searches
        time_data = datetime.datetime.today()
        ws['A' + str(i)].value = time_data
        wb.save('URL_DATA.xlsx')
        # BUG FIX: close the browser each iteration — the original leaked a
        # new Chrome process per search word.
        driver.quit()


if __name__ == "__main__":
    main()
I'm sorry — I haven't been able to verify the operation, but
since there may be multiple PDF links per page,
I modified the code to account for there being n links per search word.
Below is an excerpt of the revised main().
def main():
    """Revised version: a results page may contain several PDF links, so an
    output-row counter (lidx) independent of the search-word index (i) is
    used — each link gets its own row in 'URL_DATA.xlsx'.
    """
    # Independent output-row counter (one row per link, not per word).
    lidx = 1
    for i in range(1, 9):
        wb = op.load_workbook('general name.xlsx')
        ws = wb.active
        word = ws['A' + str(i)].value
        # NOTE(review): Selenium 4 removed the positional driver-path argument;
        # wrap the path in Service(...) if this raises — confirm the version.
        driver = webdriver.Chrome(r'C:\/chromedriver.exe')
        driver.get("https://www.pmda.go.jp/PmdaSearch/kikiSearch/")
        # BUG FIX: By-based locators replace the removed find_element_by_* API.
        elem_search_word = driver.find_element(By.ID, "txtName")
        elem_search_word.send_keys(word)
        elem_search_btn = driver.find_element(By.NAME, 'btnA')
        elem_search_btn.click()
        change_window(driver)
        cur_url = driver.current_url
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        has_pdf_link = False
        print(word)
        wb = op.load_workbook('URL_DATA.xlsx')
        ws = wb.active
        # Date and time the page was analyzed.
        time_data = datetime.datetime.today()
        # Investigate all <a> tags.
        # BUG FIX: 'group' was undefined — iterate over soup; also removed an
        # unbalanced ')' and renamed the mistyped 'ridx' references to 'lidx'
        # (the original defined lidx but indexed with ridx → NameError).
        for a_tag in soup.find_all('a'):
            link_pdf = urljoin(cur_url, a_tag.get('href'))
            # Keep only links ending in .pdf or routed via /ResultDataSetPDF/.
            if (not link_pdf.lower().endswith('.pdf')) and ('/ResultDataSetPDF/' not in link_pdf):
                continue
            if 'searchhelp' not in link_pdf:
                print(link_pdf)
                ws['A' + str(lidx)].value = time_data
                ws['B' + str(lidx)].value = link_pdf
                # Record the search word only on the first link's row.
                if not has_pdf_link:
                    ws['C' + str(lidx)].value = word
                has_pdf_link = True
                lidx += 1  # BUG FIX: 'lidx + = 1' was a SyntaxError
        if not has_pdf_link:
            # No PDF link found: still emit a row so the word is recorded.
            print('False')
            ws['A' + str(lidx)].value = time_data
            ws['B' + str(lidx)].value = has_pdf_link
            ws['C' + str(lidx)].value = word
            lidx += 1
        time.sleep(2)  # server-load protection between searches
        wb.save('URL_DATA.xlsx')
        # BUG FIX: close the browser each iteration to avoid leaking one
        # Chrome process per search word.
        driver.quit()
© 2025 OneMinuteCode. All rights reserved.