兰淼的作业二

代码


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Drive Edge through the SZSE periodic-report page: search for the company,
# narrow the announcement list to annual reports, and save the result table's
# inner HTML to disk so it can be parsed offline.
browser = webdriver.Edge()
try:
    browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')

    # Type the company name into the search box and submit with Enter.
    element = browser.find_element(By.ID, 'input_code')
    element.send_keys('泛海控股' + Keys.RETURN)
    # find_element raises NoSuchElementException when nothing matches, so this
    # lookup fails fast if the result table is not on the page.
    browser.find_element(By.ID, 'disclosure-table')

    # Open the announcement-type dropdown and pick the "年度报告" entry.
    browser.find_element(By.CSS_SELECTOR, "#select_gonggao .glyphicon").click()
    browser.find_element(By.LINK_TEXT, "年度报告").click()
    # NOTE(review): there is no explicit wait here; if the table is refreshed
    # asynchronously after the click, this may capture the unfiltered rows.
    # Consider a WebDriverWait on the table contents - TODO confirm.
    element = browser.find_element(By.ID, 'disclosure-table')
    innerHTML = element.get_attribute('innerHTML')
finally:
    # Always release the browser/driver process, even when a lookup fails.
    browser.quit()

# The context manager closes the file even if the write raises.
with open('innerHTML-泛海控股.html', 'w', encoding='utf-8') as f:
    f.write(innerHTML)


import re
import pandas as pd


class DisclosureTable:
    """Parse the search-result table of the SZSE periodic-report page.

    The constructor takes the ``innerHTML`` of the result table and immediately
    splits it into raw cell HTML (``self.df_txt``).  ``get_data`` then extracts
    the text and links from those cells into ``self.df_data``.

    NOTE(review): the tag-matching patterns below assume each data row holds
    four ``<td>`` cells (code, short name, title link, time), that the code and
    short name are wrapped in ``<a>``, the time in ``<span>``, and that the
    title anchor carries ``attachpath`` and ``href`` attributes - confirm
    against the live page markup.
    """

    # Compiled once for the whole class; DOTALL lets a tag's content span lines.
    _ROW = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
    _CELL = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
    _ANCHOR = re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL)
    _SPAN = re.compile(r'<span\b[^>]*>(.*?)</span>', re.DOTALL)
    _ATTACHPATH = re.compile(r'\sattachpath="(.*?)"', re.DOTALL)
    _HREF = re.compile(r'\shref="(.*?)"', re.DOTALL)

    def __init__(self, innerHTML):
        """Store the table HTML and split it into raw cells.

        Args:
            innerHTML: inner HTML text of the disclosure table element.
        """
        self.html = innerHTML
        # Prefix for the PDF download path (``attachpath`` attribute).
        self.prefix = 'https://disc.szse.cn/download'
        # Prefix for the announcement detail page (``href`` attribute).
        self.prefix_href = 'https://www.szse.cn/'
        self.txt_to_df()

    def get_code(self, txt):
        """Return the stripped text of the first ``<a>`` element in *txt*.

        Raises:
            AttributeError: if *txt* contains no ``<a>...</a>`` element.
        """
        return self._ANCHOR.search(txt).group(1).strip()

    def get_time(self, txt):
        """Return the stripped text of the first ``<span>`` element in *txt*.

        Raises:
            AttributeError: if *txt* contains no ``<span>...</span>`` element.
        """
        return self._SPAN.search(txt).group(1).strip()

    def txt_to_df(self):
        """Split the table HTML into rows and cells; store as ``self.df_txt``.

        The first ``<tr>`` is treated as the header row and skipped.  Rows
        with fewer than four ``<td>`` cells (e.g. a placeholder row) are
        ignored instead of raising ``IndexError``.
        """
        rows = self._ROW.findall(self.html)
        cell_lists = (self._CELL.findall(row) for row in rows[1:])
        tds = [cells for cells in cell_lists if len(cells) >= 4]

        self.df_txt = pd.DataFrame({
            '证券代码': [td[0] for td in tds],
            '简称': [td[1] for td in tds],
            '公告标题': [td[2] for td in tds],
            '公告时间': [td[3] for td in tds],
        })

    def get_link(self, txt):
        """Extract the PDF path, detail-page link and title from a title cell.

        Args:
            txt: raw HTML of a title cell.

        Returns:
            ``[attachpath, href, title]`` with surrounding whitespace removed.

        Raises:
            AttributeError: if the attachpath/href attribute or the ``<span>``
                title is missing from *txt*.
        """
        # Each piece is searched independently so the result does not depend
        # on the order in which the attributes appear inside the tag.
        attachpath = self._ATTACHPATH.search(txt).group(1).strip()
        href = self._HREF.search(txt).group(1).strip()
        title = self._SPAN.search(txt).group(1).strip()
        return [attachpath, href, title]

    def get_data(self):
        """Build, store (``self.df_data``) and return the cleaned DataFrame.

        Columns: 证券代码, 简称, 公告标题, attachpath, href, 公告时间.  The
        ``attachpath`` values are prefixed with ``self.prefix`` and the
        ``href`` values with ``self.prefix_href``.
        """
        raw = self.df_txt
        codes = [self.get_code(td) for td in raw['证券代码']]
        short_names = [self.get_code(td) for td in raw['简称']]
        links = [self.get_link(td) for td in raw['公告标题']]
        times = [self.get_time(td) for td in raw['公告时间']]

        df = pd.DataFrame({
            '证券代码': codes,
            '简称': short_names,
            '公告标题': [title for _, _, title in links],
            'attachpath': [self.prefix + path for path, _, _ in links],
            # Detail pages live on www.szse.cn, not on the download host.
            'href': [self.prefix_href + href for _, href, _ in links],
            '公告时间': times,
        })
        self.df_data = df
        return df

# Load the table HTML saved by the scraping step; the context manager closes
# the file even if reading fails.
with open('innerHTML-泛海控股.html', encoding='utf-8') as f:
    html = f.read()

# Parse the saved table and export the cleaned rows to CSV.
dt = DisclosureTable(html)
df = dt.get_data()
df.to_csv('data.csv')

结果

解释

首先利用 selenium 控制浏览器打开目标网页：用 find_element 通过唯一的 id 定位搜索框并输入公司名称，得到该公司的相关公告；再借助 Selenium IDE 查出“年度报告”筛选项对应的页面元素，用 CSS_SELECTOR 和 LINK_TEXT 定位并点击，筛选出年度报告；最后将结果表格的源代码（innerHTML）写入 innerHTML-泛海控股.html 中。
然后定义一个类（DisclosureTable），用正则表达式对所获得的源代码进行解析和筛选，并将获取的数据写入一个 csv 文件（data.csv）。