How to Write Web Scraping Results to Excel in Python

Asked 2 years ago, Updated 2 years ago, 46 views

I am doing web scraping in Python and would like to write the search words and results to Excel.

This is the layout we envision for the Excel output:
(screenshot of the expected Excel output; judging from the code, presumably the date in column A, the PDF link in column B, and the search word in column C)

But when the current Python program is run:
(screenshot of the actual Excel output)

the result looks like the above, and the output is not as expected.
The code is below; please let me know if there is a solution.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from urllib import request
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import openpyxl as op
import datetime
import time


def change_window(browser):
    # Switch the driver to the newly opened window
    all_handles = set(browser.window_handles)
    switch_to = all_handles - set([browser.current_window_handle])
    assert len(switch_to) == 1
    browser.switch_to.window(*switch_to)


def main():
    for i in range(1, 9):
        wb = op.load_workbook('general name.xlsx')
        ws = wb.active
        word = ws['A' + str(i)].value

        driver = webdriver.Chrome(r'C:\chromedriver.exe')
        driver.get("https://www.pmda.go.jp/PmdaSearch/kikiSearch/")
        # Enter the search word into the text box (located by id)
        elem_search_word = driver.find_element_by_id("txtName")
        elem_search_word.send_keys(word)
        # Click the search button (located by name)
        elem_search_btn = driver.find_element_by_name('btnA')
        elem_search_btn.click()
        change_window(driver)

        # print(driver.page_source)
        cur_url = driver.current_url
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        # print(cur_url)

        has_pdf_link = False
        print(word)

        wb = op.load_workbook('URL_DATA.xlsx')
        ws = wb.active
        ws['C' + str(i)].value = word

        for a_tag in soup.find_all('a'):
            link_pdf = urljoin(cur_url, a_tag.get('href'))
            # Keep only links that end in .pdf (or go through /ResultDataSetPDF/)
            # print(word)

            if (not link_pdf.lower().endswith('.pdf')) and ('/ResultDataSetPDF/' not in link_pdf):
                continue
            if 'searchhelp' not in link_pdf:
                has_pdf_link = True
                print(link_pdf)
                ws['B' + str(i)].value = link_pdf

        if not has_pdf_link:
            print('False')
            ws['B' + str(i)].value = has_pdf_link

            time.sleep(2)
            time_data = datetime.datetime.today()

            ws['A' + str(i)].value = time_data

        #wb=op.load_workbook('URL_DATA.xlsx')
        # ws = wb.active
            # Fill in the time
        #ws['A'+str(i)].value=time_data
        # Enter URL
        #ws['B'+str(i)].value=link_pdf
        # Fill in the general name
        #ws['C'+str(i)].value=word

        wb.save('URL_DATA.xlsx')



if __name__ == "__main__":
    main()

python python3

2022-09-30 21:33

1 Answer

Sorry, I haven't been able to verify that this runs, but
since there may be multiple PDF links per page,
I modified the code to account for n links per search word.
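
The key point is to keep a row counter that advances with every row written to the output workbook, independently of the search-word loop. Here is a minimal, standalone sketch of just that pattern, using made-up words and links in place of the scraped data:

import openpyxl as op

# Hypothetical stand-in for the scraped results
results = {
    'word1': ['https://example.com/a.pdf', 'https://example.com/b.pdf'],
    'word2': [],
}

wb = op.Workbook()
ws = wb.active
lidx = 1  # row index, independent of the word loop
for word, links in results.items():
    for link in links:
        ws['B' + str(lidx)].value = link
        ws['C' + str(lidx)].value = word
        lidx += 1
    if not links:
        # No link found: record False for this word
        ws['B' + str(lidx)].value = False
        ws['C' + str(lidx)].value = word
        lidx += 1
wb.save('URL_DATA.xlsx')

Each found link gets its own row, and a word with no links still gets one row, so nothing is overwritten.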

Below is an excerpt of main().

def main():
    # Sometimes there are multiple PDF links on a page.
    # To account for n links per search word,
    # keep a row index that is independent of the loop variable
    lidx = 1
    for i in range(1, 9):
        wb = op.load_workbook('general name.xlsx')
        ws = wb.active
        word = ws['A' + str(i)].value

        driver = webdriver.Chrome(r'C:\chromedriver.exe')
        driver.get("https://www.pmda.go.jp/PmdaSearch/kikiSearch/")
        # Enter the search word into the text box (located by id)
        elem_search_word = driver.find_element_by_id("txtName")
        elem_search_word.send_keys(word)
        # Click the search button (located by name)
        elem_search_btn = driver.find_element_by_name('btnA')
        elem_search_btn.click()
        change_window(driver)

        # print(driver.page_source)
        cur_url = driver.current_url
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        # print(cur_url)

        has_pdf_link = False
        print(word)

        wb = op.load_workbook('URL_DATA.xlsx')
        ws = wb.active

        # Date and time the page was analyzed
        time_data = datetime.datetime.today()

        # Examine all a tags
        for a_tag in soup.find_all('a'):
            link_pdf = urljoin(cur_url, a_tag.get('href'))
            # Keep only links that end in .pdf (or go through /ResultDataSetPDF/)
            # print(word)

            if (not link_pdf.lower().endswith('.pdf')) and ('/ResultDataSetPDF/' not in link_pdf):
                continue
            # A link was found
            if 'searchhelp' not in link_pdf:
                print(link_pdf)
                ws['A' + str(lidx)].value = time_data
                ws['B' + str(lidx)].value = link_pdf
                # Record the search word only on the first link's row
                if not has_pdf_link:
                    ws['C' + str(lidx)].value = word
                has_pdf_link = True
                lidx += 1

        if not has_pdf_link:
            print('False')
            ws['A' + str(lidx)].value = time_data
            ws['B' + str(lidx)].value = has_pdf_link
            ws['C' + str(lidx)].value = word
            lidx += 1

        # Wait a little, to go easy on the server?
        time.sleep(2)

        #wb=op.load_workbook('URL_DATA.xlsx')
        # ws = wb.active
            # Fill in the time
        #ws['A'+str(i)].value=time_data
        # Enter URL
        #ws['B'+str(i)].value=link_pdf
        # Fill in the general name
        #ws['C'+str(i)].value=word

        wb.save('URL_DATA.xlsx')
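
One side note, separate from the fix above: the find_element_by_id / find_element_by_name helpers used here, and passing the chromedriver path positionally, have been deprecated and removed in newer Selenium 4 releases. If you run this under Selenium 4, the equivalent calls use the By class that the code already imports, along these lines:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# The chromedriver path is just the one from the question; adjust to your environment
driver = webdriver.Chrome(service=Service(r'C:\chromedriver.exe'))
elem_search_word = driver.find_element(By.ID, "txtName")
elem_search_btn = driver.find_element(By.NAME, 'btnA')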


2022-09-30 21:33


