刘煜晨的作业三

代码


import os
import re

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service

# Run from the report folder so the relative file names opened later in this
# script resolve inside it.
os.chdir(r'C:\Users\20279\Desktop\珠海港定期报告')

class DisclosureTable():
    """Parse the SZSE periodic-report search table into pandas DataFrames.

    The constructor receives the ``innerHTML`` of the results table and
    immediately splits it into raw cell text (``self.df_txt``).  Call
    :meth:`get_data` to obtain the cleaned table with absolute links.

    NOTE(review): the tag-anchored regular expressions below were
    reconstructed -- the patterns previously in this file had all been
    reduced to ``'(.*?)'`` (which cannot supply the three groups that
    ``get_link`` reads).  Confirm them against the live page markup.
    """

    # Compiled once for the whole class; DOTALL lets a match span line breaks.
    _P_ROW = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
    _P_CELL = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
    _P_ANCHOR = re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL)
    _P_SPAN = re.compile(r'<span\b[^>]*>(.*?)</span>', re.DOTALL)
    # Groups: 1 = attachpath attribute, 2 = href attribute, 3 = title text.
    _P_LINK = re.compile(
        r'<a\b.*?attachpath="(.*?)".*?href="(.*?)".*?<span\b[^>]*>(.*?)</span>',
        re.DOTALL)

    def __init__(self, innerHTML):
        """Store the table HTML and parse it into ``self.df_txt``.

        Args:
            innerHTML: HTML text of the search-result table.
        """
        self.html = innerHTML
        self.prefix = 'https://disc.szse.cn/download'
        self.prefix_href = 'https://www.szse.cn/'
        self.txt_to_df()

    @staticmethod
    def _first_group(pattern, txt, what):
        """Return the stripped first group of *pattern* in *txt*.

        Raises:
            ValueError: if *pattern* does not match *txt*.
        """
        match = pattern.search(txt)
        if match is None:
            raise ValueError(f'no {what} found in table cell: {txt!r}')
        return match.group(1).strip()

    @staticmethod
    def _join_url(prefix, path):
        """Concatenate *prefix* and *path* with exactly one slash between."""
        return prefix.rstrip('/') + '/' + path.lstrip('/')

    def get_code(self, txt):
        """Return the stripped text inside the first ``<a>`` element of *txt*."""
        return self._first_group(self._P_ANCHOR, txt, '<a> element')

    def get_time(self, txt):
        """Return the stripped text inside the first ``<span>`` element of *txt*."""
        return self._first_group(self._P_SPAN, txt, '<span> element')

    def txt_to_df(self):
        """Split the table HTML into raw cell text and store it in ``self.df_txt``.

        The first ``<tr>`` is skipped as the header row.  Rows with fewer
        than four ``<td>`` cells are ignored instead of raising
        ``IndexError``.
        """
        rows = self._P_ROW.findall(self.html)
        cells = [self._P_CELL.findall(row) for row in rows[1:]]
        cells = [row_cells for row_cells in cells if len(row_cells) >= 4]

        self.df_txt = pd.DataFrame({
            '证券代码': [row_cells[0] for row_cells in cells],
            '简称': [row_cells[1] for row_cells in cells],
            '公告标题': [row_cells[2] for row_cells in cells],
            '公告时间': [row_cells[3] for row_cells in cells],
        })

    def get_link(self, txt):
        """Extract the link data from an announcement-title cell.

        Args:
            txt: raw HTML of the title cell.

        Returns:
            ``[attachpath, href, title]``, each stripped of surrounding
            whitespace.

        Raises:
            ValueError: if the cell does not contain the expected link markup.
        """
        match = self._P_LINK.search(txt)
        if match is None:
            raise ValueError(f'no announcement link found in table cell: {txt!r}')
        attachpath, href, title = (group.strip() for group in match.groups())
        return [attachpath, href, title]

    def get_data(self):
        """Build the cleaned table, store it in ``self.df_data`` and return it.

        Returns:
            DataFrame with columns ``证券代码``, ``简称``, ``公告标题``,
            ``attachpath`` (download URL), ``href`` (detail-page URL) and
            ``公告时间``.
        """
        raw = self.df_txt
        codes = [self.get_code(cell) for cell in raw['证券代码']]
        short_names = [self.get_code(cell) for cell in raw['简称']]
        links = [self.get_link(cell) for cell in raw['公告标题']]
        times = [self.get_time(cell) for cell in raw['公告时间']]

        join = self._join_url
        df = pd.DataFrame({
            '证券代码': codes,
            '简称': short_names,
            '公告标题': [title for _, _, title in links],
            'attachpath': [join(self.prefix, path) for path, _, _ in links],
            # Detail pages live under prefix_href, not the download prefix.
            'href': [join(self.prefix_href, href) for _, href, _ in links],
            '公告时间': times,
        })
        self.df_data = df
        return df

# Open the Edge browser with its download directory set to the report folder.
driver_url = r"C:\edgedriver\msedgedriver.exe"
prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory':r'C:\Users\20279\Desktop\珠海港定期报告'}
options = webdriver.EdgeOptions()
options.add_experimental_option('prefs', prefs)
# Selenium 4 takes the driver path through a Service object; the
# `executable_path` keyword is deprecated and removed in later 4.x releases.
driver = webdriver.Edge(service=Service(driver_url), options=options)
# Let element lookups poll for up to 10 s instead of failing while the page
# is still loading.
driver.implicitly_wait(10)

# Load the periodic-report page, search for 珠海港 and pick "annual report".
driver.get('http://www.szse.cn/disclosure/listed/fixed/index.html')

element = driver.find_element(By.ID, 'input_code')
element.send_keys('珠海港' + Keys.RETURN)

driver.find_element(By.CSS_SELECTOR, "#select_gonggao .c-selectex-btn-text").click()
driver.find_element(By.LINK_TEXT, "年度报告").click()

# Grab the disclosure-table HTML, keep a copy on disk, and parse it to get
# the PDF download paths.
# NOTE(review): the table is read immediately after the filter click; if the
# page refreshes the results asynchronously, an explicit wait may be needed
# here -- confirm against the live site.
element = driver.find_element(By.ID, 'disclosure-table')
innerHTML = element.get_attribute('innerHTML')
# `with` guarantees the file is flushed and closed before it is read back
# (the previous `f.close` without parentheses never closed it).
with open('innerHTML_珠海港.html', 'w', encoding='utf-8') as f:
    f.write(innerHTML)

with open('innerHTML_珠海港.html', encoding='utf-8') as f:
    html = f.read()

dt = DisclosureTable(html)
df = dt.get_data()

# Open the download URL of every row titled "2021年年度报告" in a new window,
# which triggers the download.
for title, url in zip(df['公告标题'], df['attachpath']):
    if title == '2021年年度报告':
        # Pass the URL as a script argument rather than splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script("window.open(arguments[0])", url)
结果

解释

先通过 selenium 打开深交所定期报告页面，搜索“珠海港”并将报告类型选为年度报告。然后获取 disclosure-table 元素的 HTML 代码并解析，得到该页面中各份报告的下载地址。最后遍历解析结果，找到“2021年年度报告”所对应的下载地址，并在浏览器新页面中打开，即可完成下载。