钟鸣宇的实验报告
第一部分:获取公司年报并保存为csv文件
代码
import fitz
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
doc = fitz.open('通用设备制造业.pdf')
p = re.compile('(\d+)\n\*?(.*)')
result = []
for page in doc.pages(83,85):
txt = page.get_text()
result = result+p.findall(txt)
result = sorted(set(result), key=result.index)
beg = 0
end = 0
for t in result:
if t[0]=='56':
beg = result.index(t)
elif t[0]=='58':
end = result.index(t)
df = pd.DataFrame({'行业大类代码': result[beg][0],
'行业大类名称': result[beg][1],
'上市公司代码': [t[0] for t in result[beg+1:end]],
'上市公司简称': [t[1] for t in result[beg+1:end]]})
df.to_csv('行业信息.csv')
#爬取深交所上市公司年报链接
browser = webdriver.Edge()
class DisclosureTable_sz():
'''
解析深交所定期报告页搜索表格
'''
def __init__(self, innerHTML):
self.html = innerHTML
self.prefix = 'https://disc.szse.cn/download'
self.prefix_href = 'https://www.szse.cn/'
p_a = re.compile('(.*?)', re.DOTALL)
p_span = re.compile('(.*?)', re.DOTALL)
self.get_code = lambda txt: p_a.search(txt).group(1).strip()
self.get_time = lambda txt: p_span.search(txt).group(1).strip()
self.txt_to_df()
def txt_to_df(self):
html = self.html
p = re.compile('(.*?)', re.DOTALL)
trs = p.findall(html)
p2 = re.compile('(.*?)', re.DOTALL)
tds = [p2.findall(tr) for tr in trs[1:]]
df = pd.DataFrame({'证券代码': [td[0] for td in tds],
'简称': [td[1] for td in tds],
'公告标题': [td[2] for td in tds],
'公告时间': [td[3] for td in tds]})
self.df_txt = df
def get_link(self, txt):
p_txt = '(.*?)'
p = re.compile(p_txt, re.DOTALL)
matchObj = p.search(txt)
attachpath = matchObj.group(1).strip()
href = matchObj.group(2).strip()
title = matchObj.group(3).strip()
return([attachpath, href, title])
def get_data(self):
get_code = self.get_code
get_time = self.get_time
get_link = self.get_link
df = self.df_txt
codes = [get_code(td) for td in df['证券代码']]
short_names = [get_code(td) for td in df['简称']]
ahts = [get_link(td) for td in df['公告标题']]
times = [get_time(td) for td in df['公告时间']]
prefix = self.prefix
prefix_href = self.prefix_href
df = pd.DataFrame({'证券代码': codes,
'简称': short_names,
'公告标题': [aht[2] for aht in ahts],
'attachpath': [prefix + aht[0] for aht in ahts],
'href': [prefix_href + aht[1] for aht in ahts],
'公告时间': times
})
self.df_data = df
return(df)
browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
browser.implicitly_wait(8)
for name in df.iloc[0:4,3]:
element = browser.find_element(By.ID, 'input_code')
element.send_keys(name + Keys.RETURN)
browser.find_element(By.LINK_TEXT, '请选择公告类别').click()
browser.find_element(By.LINK_TEXT, '年度报告').click()
y_start = browser.find_element(By.CLASS_NAME, 'input-left')
y_start.send_keys('2012' + Keys.RETURN)
y_end = browser.find_element(By.CLASS_NAME, 'input-right')
y_end.send_keys('2021' + Keys.RETURN)
time.sleep(1)
element = browser.find_element(By.ID, 'disclosure-table')
innerHTML = element.get_attribute('innerHTML')
browser.find_element(By.CSS_SELECTOR, ".btn-clearall").click()
html = innerHTML
dt = DisclosureTable_sz(html)
df1 = dt.get_data()
p = re.compile(".*?(\*).*?")
biaoti = [p.sub("",t) for t in df1['公告标题']]
df1['公告标题'] = biaoti;del p,biaoti
df1.to_csv(name+'.csv')
browser.quit()
#爬取上交所上市公司年报链接
class DisclosureTable_sh():
'''
解析上交所定期报告页搜索表格
'''
def __init__(self, innerHTML):
self.html = innerHTML
self.prefix_href = 'http://www.sse.com.cn/'
p_span = re.compile('(.*?)', re.DOTALL)
self.get_span = lambda txt: p_span.search(txt).group(1).strip()
self.txt_to_df()
def txt_to_df(self):
html = self.html
p = re.compile('(.+?)', re.DOTALL)
trs = p.findall(html)
p2 = re.compile('(.*?)', re.DOTALL)
tds = [p2.findall(tr) for tr in trs[1:]]
df = pd.DataFrame({'证券代码': [td[0] for td in tds],
'简称': [td[1] for td in tds],
'公告标题': [td[2] for td in tds],
'公告时间': [td[3] for td in tds]})
self.df_txt = df
def get_link(self, txt):
p_txt = '(.*?)'
p = re.compile(p_txt, re.DOTALL)
matchObj = p.search(txt)
href = matchObj.group(1).strip()
title = matchObj.group(2).strip()
return([href, title])
def get_data(self):
get_span = self.get_span
get_link = self.get_link
df = self.df_txt
codes = [get_span(td) for td in df['证券代码']]
short_names = [get_span(td) for td in df['简称']]
ahts = [get_link(td) for td in df['公告标题']]
times = [td for td in df['公告时间']]
prefix_href = self.prefix_href
df = pd.DataFrame({'证券代码': codes,
'简称': short_names,
'公告标题': [aht[1] for aht in ahts],
'href': [prefix_href + aht[0] for aht in ahts],
'公告时间': times
})
self.df_data = df
return(df)
def check_nextpage(driver):
try:
driver.find_element(By.LINK_TEXT, '下一页')
return True
except:
return False
browser = webdriver.Edge()
browser.implicitly_wait(5)
i = 4
for code in df.iloc[4:,2]:
browser.get('http://www.sse.com.cn/disclosure/listedinfo/regular/')
time.sleep(1)
element = browser.find_element(By.ID, 'inputCode')
element.send_keys(code)
browser.find_element(By.CSS_SELECTOR, ".sse_outerItem:nth-child(4) .filter-option-inner-inner").click()
browser.find_element(By.LINK_TEXT, "年报").click()
time.sleep(1)
element = browser.find_element(By.CLASS_NAME, 'table-responsive')
innerHTML = element.get_attribute('innerHTML')
html = innerHTML
dt = DisclosureTable_sh(html)
df1 = dt.get_data()
if check_nextpage(browser) == True:
while True:
nextpage = browser.find_element(By.LINK_TEXT,'下一页')
nextpage.click()
time.sleep(1)
element = browser.find_element(By.CLASS_NAME, 'table-responsive')
innerHTML = element.get_attribute('innerHTML')
html = innerHTML
dt = DisclosureTable_sh(html)
df2 = dt.get_data()
df1 = df1.append(df2)
break
df1.reset_index(drop=True,inplace=True)
name = df.iloc[i,3]
p = re.compile(".*?(\*).*?")
biaoti = [p.sub("",t) for t in df1['公告标题']]
df1['公告标题'] = biaoti;del p,biaoti
df1.to_csv(name+'.csv')
i = i+1
browser.quit()
结果
尝试了许多方法我依旧安装不了fitz与pdfplumber所以很遗憾我看不到结果。以下绘图结果为我手动获取数据制作而成。
第二部分:绘图代码
import pandas as pd
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False
df = pd.read_csv('通用设备制造业.csv',index_col=0,dtype=(str))
df_eps = pd.read_csv('通用设备制造业基本每股收益数据.csv',index_col=0)
df_revenue = pd.read_csv('通用设备制造业主营业务收入数据.csv',index_col=0)
df_information = pd.read_csv('通用设备制造业上市公司基本信息.csv',index_col=0,dtype=(str))
df_revenue = df_revenue/100000000
df_revenue.loc['sum'] = df_revenue.sum()
df_revenue = df_revenue.T
df_revenue = df_revenue.sort_values(by='sum',ascending=False,axis=0)
df_revenue = df_revenue.iloc[:10]
top10_list = df_revenue.index.values.tolist()
for i in range(len(top10_list)):
top10_list[i] = top10_list[i][:-6]
df_revenue = df_revenue.T
df_revenue = df_revenue.iloc[:-1]
for name in df['上市公司简称']:
if name not in top10_list:
df_eps.drop(columns=[name+'基本每股收益'], axis=1, inplace=True)
for name in top10_list:
df_revenue.rename(columns={name+'主营业务收入':name}, inplace=True)
df_eps.rename(columns={name+'基本每股收益':name}, inplace=True)
#主营业务收入
plt.figure(figsize=(10,8))
x = df_revenue.index
y_1 = df_revenue.iloc[:,0]
y_2 = df_revenue.iloc[:,1]
y_3 = df_revenue.iloc[:,2]
y_4 = df_revenue.iloc[:,3]
y_5 = df_revenue.iloc[:,4]
plt.plot(x, y_1, marker='^', markersize=8, label=df_revenue.columns[0], linewidth=2.0)
plt.plot(x, y_2, marker='^', markersize=8, label=df_revenue.columns[1], linewidth=2.0)
plt.plot(x, y_3, marker='^', markersize=8, label=df_revenue.columns[2], linewidth=2.0)
plt.plot(x, y_4, marker='^', markersize=8, label=df_revenue.columns[3], linewidth=2.0)
plt.plot(x, y_5, marker='^', markersize=8, label=df_revenue.columns[4], linewidth=2.0)
plt.xticks(range(2012,2022), fontsize=16)
plt.xlabel("年份", fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel("主营业务收入(亿元)", fontsize=16)
plt.title("2012-2021年通用设备制造业上市公司主营业务收入随时间变化趋势图", fontsize=16)
plt.legend(loc=1, prop={'size':15})
plt.grid()
#主营业务收入(续)
plt.figure(figsize=(10,8))
y_6 = df_revenue.iloc[:,5]
y_7 = df_revenue.iloc[:,6]
y_8 = df_revenue.iloc[:,7]
y_9 = df_revenue.iloc[:,8]
y_10 = df_revenue.iloc[:,9]
plt.plot(x, y_6, marker='^', markersize=8, label=df_revenue.columns[5], linewidth=2.0)
plt.plot(x, y_7, marker='^', markersize=8, label=df_revenue.columns[6], linewidth=2.0)
plt.plot(x, y_8, marker='^', markersize=8, label=df_revenue.columns[7], linewidth=2.0)
plt.plot(x, y_9, marker='^', markersize=8, label=df_revenue.columns[8], linewidth=2.0)
plt.plot(x, y_10, marker='^', markersize=8, label=df_revenue.columns[9], linewidth=2.0)
plt.xticks(range(2012,2022), fontsize=16)
plt.xlabel("年份", fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel("主营业务收入(亿元)", fontsize=16)
plt.title("2012-2021年通用设备制造业上市公司主营业务收入随时间变化趋势图(续)", fontsize=16)
plt.legend(loc=1, prop={'size': 15})
plt.grid()
#基本每股收益
plt.figure(figsize=(10,8))
x = df_eps.index
y_1 = df_eps.iloc[:,4]
y_2 = df_eps.iloc[:,8]
y_3 = df_eps.iloc[:,5]
y_4 = df_eps.iloc[:,6]
y_5 = df_eps.iloc[:,1]
plt.plot(x, y_1, marker='s', markersize=7, label=df_eps.columns[4], linewidth=2.0)
plt.plot(x, y_2, marker='s', markersize=7, label=df_eps.columns[8], linewidth=2.0)
plt.plot(x, y_3, marker='s', markersize=7, label=df_eps.columns[5], linewidth=2.0)
plt.plot(x, y_4, marker='s', markersize=7, label=df_eps.columns[6], linewidth=2.0)
plt.plot(x, y_5, marker='s', markersize=7, label=df_eps.columns[1], linewidth=2.0)
plt.xticks(range(2012,2022), fontsize=16)
plt.xlabel("年份", fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel("基本每股收益(元/股)", fontsize=16)
plt.title("2012-2021年通用设备制造业上市公司基本每股收益随时间变化趋势图", fontsize=16)
plt.legend(loc=0, prop={'size':15})
plt.grid()
#基本每股收益(续)
plt.figure(figsize=(10,8))
y_6 = df_eps.iloc[:,9]
y_7 = df_eps.iloc[:,7]
y_8 = df_eps.iloc[:,3]
y_9 = df_eps.iloc[:,2]
y_10 = df_eps.iloc[:,0]
plt.plot(x, y_6, marker='s', markersize=7, label=df_eps.columns[9], linewidth=2.0)
plt.plot(x, y_7, marker='s', markersize=7, label=df_eps.columns[7], linewidth=2.0)
plt.plot(x, y_8, marker='s', markersize=7, label=df_eps.columns[3], linewidth=2.0)
plt.plot(x, y_9, marker='s', markersize=7, label=df_eps.columns[2], linewidth=2.0)
plt.plot(x, y_10, marker='s', markersize=7, label=df_eps.columns[0], linewidth=2.0)
plt.xticks(range(2012,2022), fontsize=16)
plt.xlabel("年份", fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel("基本每股收益(元/股)", fontsize=16)
plt.title("2012-2021年通用设备制造业上市公司基本每股收益随时间变化趋势图(续)", fontsize=16)
plt.legend(loc=1, prop={'size': 15})
plt.grid()
绘图结果