高成峰的作业二

代码


  from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Explanation: use selenium to drive the browser, search the SZSE
# periodic-report disclosure page for one listed company, locate the
# elements of interest by their id, and write the result table's HTML to
# innerHTML.html.
# (webdriver.Firefox() can be substituted below to drive Firefox instead.)
browser = webdriver.Edge()
try:
    # Let element lookups poll for up to 10 s instead of raising
    # NoSuchElementException immediately when an element is not in the DOM yet.
    browser.implicitly_wait(10)
    browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
    element = browser.find_element(By.ID, 'input_code')  # Find the search box
    element.send_keys('申万宏源' + Keys.RETURN)

    # NOTE(review): the table is read right after the search is submitted;
    # if the page refreshes the results asynchronously this may capture the
    # table as it was before the search -- confirm against the live page.
    element = browser.find_element(By.ID, 'disclosure-table')
    innerHTML = element.get_attribute('innerHTML')
finally:
    # Always close the browser, even when a lookup above fails.
    browser.quit()

with open('innerHTML.html', 'w', encoding='utf-8') as f:
    f.write(innerHTML)

# NOTE(review): to_pretty is neither defined nor imported in the visible part
# of this file, so this line raises NameError unless the name is supplied by
# the surrounding environment -- confirm where it is supposed to come from.
html = to_pretty('innerHTML.html')


import re
import pandas as pd

class DisclosureTable():
    """Parse the search-result table of the SZSE periodic-report page.

    The constructor receives the table's HTML text and immediately splits it
    into raw per-cell HTML (``self.df_txt``).  Call :meth:`get_data` to get
    the cleaned table with absolute download and detail links.

    NOTE(review): the tag-based regular expressions in this class were
    reconstructed; a bare ``(.*?)`` pattern has a single capture group and
    cannot provide the three groups ``get_link`` reads.  Confirm the tag and
    attribute names (``attachpath``, ``href``) against the live page markup.
    """

    def __init__(self, innerHTML):
        """Store the table HTML and split it into raw cells.

        Args:
            innerHTML: HTML text of the disclosure table; its first ``<tr>``
                is treated as the header row.
        """
        self.html = innerHTML
        self.prefix = 'https://disc.szse.cn/download'
        self.prefix_href = 'https://www.szse.cn/'
        # Extract the stripped text of the first <a> / <span> in a cell.
        p_a = re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL)
        p_span = re.compile(r'<span\b[^>]*>(.*?)</span>', re.DOTALL)
        self.get_code = lambda txt: p_a.search(txt).group(1).strip()
        self.get_time = lambda txt: p_span.search(txt).group(1).strip()
        self.txt_to_df()

    def txt_to_df(self):
        """Split the table HTML into a DataFrame of raw cell HTML.

        The first ``<tr>`` is skipped as the header row.  Rows with fewer
        than four ``<td>`` cells (for example a single-cell placeholder row)
        are skipped instead of raising ``IndexError``.  The result is stored
        in ``self.df_txt``.
        """
        p_tr = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
        p_td = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)

        rows = [p_td.findall(tr) for tr in p_tr.findall(self.html)[1:]]
        rows = [cells for cells in rows if len(cells) >= 4]

        self.df_txt = pd.DataFrame({
            '证券代码': [cells[0] for cells in rows],
            '简称': [cells[1] for cells in rows],
            '公告标题': [cells[2] for cells in rows],
            '公告时间': [cells[3] for cells in rows],
        })

    def get_link(self, txt):
        """Extract the link data from an announcement-title cell.

        Args:
            txt: raw HTML of the title cell.

        Returns:
            ``[attachpath, href, title]``, each stripped of surrounding
            whitespace.

        Raises:
            AttributeError: if the cell does not match the expected markup.
        """
        p_txt = ('<a.*?attachpath="(.*?)".*?href="(.*?)"'
                 '.*?<span.*?>(.*?)</span>')
        p = re.compile(p_txt, re.DOTALL)
        matchObj = p.search(txt)
        attachpath = matchObj.group(1).strip()
        href = matchObj.group(2).strip()
        title = matchObj.group(3).strip()
        return [attachpath, href, title]

    def get_data(self):
        """Build the cleaned announcement table.

        Returns:
            A DataFrame with the columns 证券代码, 简称, 公告标题,
            attachpath, href and 公告时间.  ``attachpath`` is prefixed with
            ``self.prefix`` and ``href`` with ``self.prefix_href``.  The same
            frame is also stored in ``self.df_data``.
        """
        raw = self.df_txt
        codes = [self.get_code(cell) for cell in raw['证券代码']]
        short_names = [self.get_code(cell) for cell in raw['简称']]
        links = [self.get_link(cell) for cell in raw['公告标题']]
        times = [self.get_time(cell) for cell in raw['公告时间']]

        prefix = self.prefix
        # The detail-page link lives on the main site, not the download host.
        prefix_href = self.prefix_href
        df = pd.DataFrame({
            '证券代码': codes,
            '简称': short_names,
            '公告标题': [link[2] for link in links],
            'attachpath': [prefix + link[0] for link in links],
            'href': [prefix_href + link[1] for link in links],
            '公告时间': times,
        })
        self.df_data = df
        return df


# Explanation: parse the saved table HTML with the class above, keep the
# fields of interest, and write them to a CSV file.
# The with-block closes the file even if reading fails.
with open('innerHTML.html', encoding='utf-8') as f:
    html = f.read()

dt = DisclosureTable(html)
df = dt.get_data()
df.to_csv('申万宏源年报.csv')

结果

结果截图 结果截图