刘辉的作业二

代码


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
browser = webdriver.Edge()  # drive the Edge browser
browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')  # open the disclosure page

# Download data
element = browser.find_element(By.ID, 'input_code')  # locate the search box
element.send_keys('国新健康' + Keys.RETURN)  # type the stock name and submit the search

# The table must not be read until the search above has finished, so wait
# explicitly (up to 10 s) instead of reading it straight away.
# NOTE(review): assumes the result rows display the stock's short name as
# visible text inside #disclosure-table -- confirm against the live page.
WebDriverWait(browser, 10).until(
    EC.text_to_be_present_in_element((By.ID, 'disclosure-table'), '国新健康')
)
element = browser.find_element(By.ID, 'disclosure-table')  # the disclosure table
innerHTML = element.get_attribute('innerHTML')  # the table's inner HTML markup

# Save the markup; the context manager closes the file even if the write fails.
with open('innerHTML.html', 'w', encoding='utf-8') as f:
    f.write(innerHTML)

#提取数据
import re
import pandas as pd
# Read the saved table markup back in. The original wrote `f.close` without
# parentheses, which never closed the file; the context manager does.
with open('innerHTML.html', encoding='utf-8') as f:
    html = f.read()

def txt_to_df(html):
    """Split HTML table markup into a DataFrame of raw cell contents.

    Each ``<tr>`` after the first becomes one row, and the inner markup of its
    first four ``<td>`` cells fills the four columns. The cell contents are
    left as raw HTML.

    Args:
        html: HTML text containing ``<tr>``/``<td>`` table rows.

    Returns:
        A DataFrame with columns '证券代码', '简称', '公告标题', '公告时间'.
        It is empty when no usable row is found.
    """
    # The bare '(.*?)' patterns matched only empty strings, so they are
    # rebuilt here to capture what sits between the row and cell tags.
    row_pattern = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
    cell_pattern = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)

    # The first <tr> is skipped, as in the original (presumably the header row).
    rows = [cell_pattern.findall(tr) for tr in row_pattern.findall(html)[1:]]
    # Rows with fewer than four cells (e.g. a placeholder row) cannot fill
    # every column, so skip them instead of raising IndexError.
    rows = [cells for cells in rows if len(cells) >= 4]

    df = pd.DataFrame({
        '证券代码': [cells[0] for cells in rows],
        '简称': [cells[1] for cells in rows],
        '公告标题': [cells[2] for cells in rows],
        '公告时间': [cells[3] for cells in rows],
    })
    return df

# Convert the HTML text read from innerHTML.html into a DataFrame via txt_to_df.
df_txt = txt_to_df(html)


# Patterns capturing the content of the first <a> / <span> element in a cell.
# The bare '(.*?)' patterns always captured an empty string, so the tag
# delimiters are restored here.
p_a = re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL)
p_span = re.compile(r'<span\b[^>]*>(.*?)</span>', re.DOTALL)


def _first_group(pattern, txt):
    """Return the stripped first capture group of *pattern* found in *txt*.

    Raises:
        ValueError: if *pattern* does not match anywhere in *txt*.
    """
    match = pattern.search(txt)
    if match is None:
        raise ValueError(f'pattern {pattern.pattern!r} not found in {txt!r}')
    return match.group(1).strip()


def get_code(txt):
    """Return the stripped content of the first ``<a>`` element in *txt*."""
    return _first_group(p_a, txt)


def get_time(txt):
    """Return the stripped content of the first ``<span>`` element in *txt*."""
    return _first_group(p_span, txt)

def get_link(txt):
    """Extract the attachment path, detail href and title from a title cell.

    The original pattern '(.*?)' defined one group while the code read groups
    1-3, so it could only raise IndexError. The pattern is rebuilt with one
    group per returned field.
    NOTE(review): assumes the ``attachpath`` attribute precedes ``href`` on
    the ``<a>`` tag and that the title sits in the following ``<span>`` --
    confirm against the saved page markup.

    Args:
        txt: Raw HTML of one title cell.

    Returns:
        A list ``[attachpath, href, title]`` of stripped strings.

    Raises:
        ValueError: if *txt* does not contain the expected link markup.
    """
    p_txt = r'<a\b.*?attachpath="(.*?)".*?href="(.*?)".*?<span\b[^>]*>(.*?)</span>'
    p = re.compile(p_txt, re.DOTALL)
    matchObj = p.search(txt)
    if matchObj is None:
        raise ValueError(f'announcement link not found in {txt!r}')
    attachpath = matchObj.group(1).strip()
    href = matchObj.group(2).strip()
    title = matchObj.group(3).strip()
    return [attachpath, href, title]

def get_data(df_txt):
    """Turn the raw-markup table into a clean table of announcement fields.

    Each column of *df_txt* is passed through its extractor (``get_code`` for
    the code and short-name columns, ``get_link`` for the title column,
    ``get_time`` for the time column), and the two link fragments are
    prefixed with their base URLs.

    Args:
        df_txt: DataFrame with columns '证券代码', '简称', '公告标题', '公告时间'.

    Returns:
        A new DataFrame with columns '证券代码', '简称', '公告标题',
        'attachpath', 'href', '公告时间'.
    """
    download_root = 'https://disc.szse.cn/download'
    site_root = 'https://www.szse.cn/'

    # Extract column by column, in the same order as the source columns.
    code_list = list(map(get_code, df_txt['证券代码']))
    name_list = list(map(get_code, df_txt['简称']))
    link_parts = list(map(get_link, df_txt['公告标题']))
    time_list = list(map(get_time, df_txt['公告时间']))

    # Each get_link result is indexed as [attachpath, href, title].
    columns = {
        '证券代码': code_list,
        '简称': name_list,
        '公告标题': [parts[2] for parts in link_parts],
        'attachpath': [download_root + parts[0] for parts in link_parts],
        'href': [site_root + parts[1] for parts in link_parts],
        '公告时间': time_list,
    }
    return pd.DataFrame(columns)

# Build the final table from df_txt with get_data.
df_data = get_data(df_txt)

# Write the result to a CSV file at a relative path, using to_csv's default options.
df_data.to_csv('data_国新健康.csv')

结果

结果截图 结果截图