from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from parse_disclosure_table import DisclosureTable
import re
import requests
import pandas as pd
import fitz
import csv
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Launch Edge and open the SZSE periodic-report disclosure page.
browser = webdriver.Edge()
browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')

# The page filters are set by replaying these clicks strictly in order;
# every element is located right before it is clicked.
filter_clicks = (
    # Report type: open the announcement-type dropdown and pick "年度报告".
    (By.CSS_SELECTOR, "#select_gonggao .glyphicon"),
    (By.LINK_TEXT, "年度报告"),
    # Date selection: drive the date-picker widget.
    # NOTE(review): the nth-child indices depend on the widget's current
    # layout; which year/day they resolve to cannot be told from this file.
    (By.CSS_SELECTOR, ".input-left"),
    (By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-year span"),
    (By.CSS_SELECTOR, ".active li:nth-child(113)"),
    (By.LINK_TEXT, "6月"),
    (By.CSS_SELECTOR, ".active > .dropdown-menu li:nth-child(1)"),
    (By.CSS_SELECTOR, "#c-datepicker-menu-1 tr:nth-child(2) > .weekend:nth-child(1) > .tdcontainer"),
    (By.CSS_SELECTOR, ".today > .tdcontainer"),
    # Run the query with the filters chosen above.
    (By.ID, "query-btn"),
)
for how, what in filter_clicks:
    browser.find_element(how, what).click()
# Download the industry-classification result PDF (2021 Q3, per the file
# name) into the working directory.
href = 'http://www.csrc.gov.cn/csrc/c100103/c1558619/1558619/files/1638277734844_11692.pdf'
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(href, allow_redirects=True, timeout=60)
try:
    # Fail loudly on an HTTP error instead of saving an error page as a PDF.
    r.raise_for_status()
    with open('2021年3季度上市公司行业分类结果.pdf', 'wb') as f:
        f.write(r.content)
finally:
    # Release the connection whether or not the write succeeded.
    r.close()
# Collect the listed companies of the "医药制造业" section from the
# classification PDF (the original comment refers to it as industry class 69).
# The section is read from the pages at 0-based indices 19 and 20.
with fitz.open('2021年3季度上市公司行业分类结果.pdf') as doc:
    toc_txt1 = doc[19].get_text()
    toc_txt2 = doc[20].get_text()

# Raw strings avoid the invalid "\医" / "\罗" escapes of the old patterns; the
# regex engine treated those as plain literals, so matching is unchanged.
# First page: everything after the section heading up to the last newline.
r1 = re.compile(r'(?<=医药制造业\n)(.*)(?=\n)', re.DOTALL)
txt1 = r1.findall(toc_txt1)
# Second page: from the heading up to (not including) "罗欣药业".
# NOTE(review): that company and everything after it are excluded here —
# presumably it starts the next section; confirm against the PDF.
r2 = re.compile(r'(?<=医药制造业\n)(.*?)(?=罗欣药业)', re.DOTALL)
txt2 = r2.findall(toc_txt2)
if not txt1 or not txt2:
    # Previously this surfaced as a bare IndexError on txt1[0] / txt2[0].
    raise ValueError('医药制造业 section not found on the expected PDF pages')

# Each entry is a 6-digit stock code followed by the company short name.
r = re.compile(r'(\d{6})\s*(\w+)\s*')
text1 = r.findall(txt1[0])
text2 = r.findall(txt2[0])
# firm is a list of (code, name) tuples consumed by the later steps.
firm = text1 + text2
# Drive the browser through every selected company: type its stock code into
# the search box, then save the inner HTML of the result table to
# inner_HTML_<name>.html.
try:
    for code, name in firm:
        # The input box is looked up again before each interaction, exactly
        # as before, so a page refresh cannot leave a stale element behind.
        browser.find_element(By.ID, "input_code").click()
        browser.find_element(By.ID, 'input_code').send_keys(code)
        time.sleep(0.5)
        browser.find_element(By.ID, "input_code").send_keys(Keys.ENTER)
        element = browser.find_element(By.ID, 'disclosure-table')
        # Fixed pause before reading the table, as in the original flow.
        time.sleep(0.5)
        innerHTML = element.get_attribute('innerHTML')
        # Open the output file only once the HTML is in hand, so a Selenium
        # failure no longer leaks a handle or leaves an empty file behind.
        with open('inner_HTML_%s.html' % name, 'w', encoding='utf-8') as f:
            f.write(innerHTML)
        time.sleep(0.5)
        # Remove the selected-company tag before entering the next code.
        browser.find_element(By.CSS_SELECTOR, ".selected-item:nth-child(2) > .icon-remove").click()
        time.sleep(0.5)
finally:
    # Always shut the browser down, even when a lookup above fails.
    browser.quit()
# Convert each saved result table into data_<name>.csv (the original comment
# describes the content as the companies' annual-report addresses).
for _code, name in firm:
    # The context manager closes the file even if reading raises.
    with open('inner_HTML_%s.html' % name, encoding='utf-8') as f:
        t = f.read()
    # DisclosureTable comes from the project-local parse_disclosure_table
    # module; get_data() is expected to return a DataFrame (it is written
    # out with to_csv right below).
    dt = DisclosureTable(t)
    df = dt.get_data()
    df.to_csv('data_%s.csv' % name)
# Drop the "摘要" (summary) entries from every data_<name>.csv, then download
# the remaining report PDFs as <name>_<k>.pdf.
#
# Results consumed by later steps:
#   df5 - one row per kept report (columns 股票简称, attachpath), all companies
#   df4 - the distinct company names that have at least one kept report
#   df1 - kept reports of the company processed last (rebuilt per company)
#
# Rows are gathered in plain lists and turned into DataFrames once, because
# DataFrame.append was removed in pandas 2.0 (and copied the frame per row).
report_columns = ['股票简称', 'attachpath']
all_reports = []
names_with_reports = []
for _code, name in firm:
    firm_reports = []
    with open('data_%s.csv' % name, 'r', newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header row; tolerate a completely empty file.
        next(csvreader, None)
        for row in csvreader:
            # NOTE(review): assumes column 3 holds the announcement title and
            # column 4 the attachment URL — verify against DisclosureTable.
            if '摘要' in row[3]:
                continue
            # A fresh dict per row, instead of one shared mutable dict.
            firm_reports.append({'股票简称': name, 'attachpath': row[4]})
    df1 = pd.DataFrame(firm_reports, columns=report_columns)
    all_reports.extend(firm_reports)
    if firm_reports:
        names_with_reports.append(name)

    # Download this company's kept reports; k is the 0-based position of the
    # report within the company's list and becomes part of the file name.
    for k, url in enumerate(df1['attachpath']):
        r = requests.get(url, allow_redirects=True, timeout=60)
        try:
            # Do not store an HTTP error page under a .pdf name.
            r.raise_for_status()
            with open('{0}_{1}.pdf'.format(name, k), 'wb') as f:
                f.write(r.content)
        finally:
            r.close()

df5 = pd.DataFrame(all_reports, columns=report_columns)
# Reset the index so later positional loops over df4 cannot hit a gap.
df4 = (
    pd.DataFrame({'股票简称': names_with_reports})
    .drop_duplicates()
    .reset_index(drop=True)
)
# Extract "股票简称", "股票代码", "办公地址" and "公司网址" from the first
# downloaded report (<name>_0.pdf) of every company in df4.
#
# Each field is searched independently, page by page, and takes the first
# page on which its pattern matches. The previous version pulled all four
# fields from one page inside a single try block, so one missing field
# aborted the extraction of every remaining one.
# NOTE(review): the source indentation was lost, so the old nesting had to be
# inferred; it appeared to read from the last page carrying each label.
field_patterns = {
    '股票简称': re.compile(r'股票简称\s+(.+?)\n', re.DOTALL),
    '股票代码': re.compile(r'股票代码\s+(\d+)\s+', re.DOTALL),
    '办公地址': re.compile(r'办公地址\s+(.+?)\n', re.DOTALL),
    '公司网址': re.compile(r'公司[国际互联网]*网址\s+(.*?.+?)\s+', re.DOTALL),
}
company_rows = []
for name in df4['股票简称']:
    lst_text = {}
    with fitz.open('{0}_0.pdf'.format(name)) as doc:
        for page_no in range(doc.page_count):
            text = doc[page_no].get_text()
            for field, pattern in field_patterns.items():
                if field in lst_text:
                    continue
                found = pattern.search(text)
                if found:
                    lst_text[field] = found.group(1)
            # Stop scanning as soon as every field has a value.
            if len(lst_text) == len(field_patterns):
                break
    if len(lst_text) < len(field_patterns):
        # Same message as before; missing fields stay empty in the row.
        print('错误')
    company_rows.append(lst_text)
# Built in one go: DataFrame.append was removed in pandas 2.0.
df2 = pd.DataFrame(company_rows, columns=['股票简称', '股票代码', '办公地址', '公司网址'])
# Extract "营业收入(元)" and "基本每股收益(元/股)" from every downloaded
# report and write one CSV per company and metric.
#
# The parentheses around the unit are now matched literally, in ASCII or
# full-width form. They used to be unescaped, which made "(元)" a capture
# group: the pattern then only matched the label without parentheses, and
# findall returned tuples instead of the number.
r1 = re.compile(r'\s营业[总]*收入[(（]元[)）]\s*(-?[\d,.]+)\s*', re.DOTALL)
# Report year as printed on the first page, e.g. "2020 年" before "年度报告".
r2 = re.compile(r'\n(20[\d]{2}\s年)年度报告', re.DOTALL)
r3 = re.compile(r'\s基本每股收益[(（]元/股[)）]\s*(-?[\d,.]+)\s*', re.DOTALL)


def _first_match(pattern, texts):
    """Return group 1 of the first text that matches pattern, or None."""
    for text in texts:
        found = pattern.search(text)
        if found:
            return found.group(1)
    return None


def _export_metric(pattern, filename_template):
    """Write one CSV per company holding the metric captured by pattern.

    For every company in df4, each of its reports (<name>_<i>.pdf, one per
    row of df5) adds one column named after the report year; the cell is the
    first value the pattern captures in that report. Values are kept as the
    raw text found in the PDF. Reports without a year or without a value are
    skipped.

    NOTE(review): the source indentation was lost; the old loop seemed to add
    a column for every matching page, which duplicated year columns.
    """
    for x in df4['股票简称']:
        data = pd.DataFrame()
        report_count = int((df5['股票简称'] == x).sum())
        for i in range(report_count):
            with fitz.open('{0}_{1}.pdf'.format(x, i)) as doc:
                year = r2.findall(doc[0].get_text())
                page_texts = (doc[p].get_text() for p in range(doc.page_count))
                value = _first_match(pattern, page_texts)
            if value is None or not year:
                continue
            data1 = pd.DataFrame([[value]], index=[x], columns=year[:1])
            # Newer columns go first, as in the original concat order.
            data = pd.concat([data1, data], join='outer', axis=1)
        data.to_csv(filename_template.format(x), encoding='utf-8')


# Operating revenue (元).
_export_metric(r1, '{}——营业收入.csv')
# Basic earnings per share (元/股).
_export_metric(r3, '{}——每股收益.csv')
# Plotting setup and data preparation.
# rcParams is reached through pyplot: the file never imports matplotlib as
# "mpl", so the previous mpl.rcParams lines raised NameError.
plt.rcParams['font.sans-serif'] = ['SimHei']   # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False     # keep minus signs rendering with that font

df = pd.read_csv('行业信息.csv', index_col=0, dtype=str)
df_eps = pd.read_csv('基本每股收益.csv', index_col=0)
df_revenue = pd.read_csv('营业收入.csv', index_col=0)
df_information = pd.read_csv('医药制造行业信息.csv', index_col=0, dtype=str)

# Rescale revenue by 1e8.
df_revenue = df_revenue / 100000000

# Rank the company columns by their total over all rows and keep the ten
# largest: append a 'sum' row, transpose so companies become rows, sort.
df_revenue.loc['sum'] = df_revenue.sum()
df_revenue = df_revenue.T
df_revenue = df_revenue.sort_values(by='sum', ascending=False, axis=0)
df_revenue = df_revenue.iloc[:10]

# Column labels look like "<name>主营业务收入"; cut the 6-character suffix to
# obtain the bare company names.
top10_list = [label[:-6] for label in df_revenue.index.values.tolist()]

# Back to the original orientation, then drop the helper 'sum' row.
df_revenue = df_revenue.T
df_revenue = df_revenue.iloc[:-1]

# Keep only the top-10 companies in the EPS table.
top10_names = set(top10_list)
for name in df['上市公司简称']:
    if name not in top10_names:
        df_eps.drop(columns=[name + '基本每股收益'], inplace=True)

# Use the bare company name as column label in both tables.
for name in top10_list:
    df_revenue.rename(columns={name + '主营业务收入': name}, inplace=True)
    df_eps.rename(columns={name + '基本每股收益': name}, inplace=True)
# Revenue trend, part 1: one line per company for the first five columns of
# df_revenue.
plt.figure(figsize=(10, 8))
x = df_revenue.index
for col in range(5):
    plt.plot(
        x,
        df_revenue.iloc[:, col],
        marker='^',
        markersize=8,
        label=df_revenue.columns[col],
        linewidth=2.0,
    )
text_size = 16
plt.xticks(range(2012, 2022), fontsize=text_size)
plt.xlabel("年份", fontsize=text_size)
plt.yticks(fontsize=text_size)
plt.ylabel("营业收入", fontsize=text_size)
plt.title("营业收入随时间变化趋势图", fontsize=text_size)
plt.legend(loc=1, prop={'size': 15})
plt.grid()
# Revenue trend, continued: one line per company for columns 5-9 of
# df_revenue, drawn on a new figure.
plt.figure(figsize=(10, 8))
for col in range(5, 10):
    plt.plot(
        df_revenue.index,
        df_revenue.iloc[:, col],
        marker='^',
        markersize=8,
        label=df_revenue.columns[col],
        linewidth=2.0,
    )
text_size = 16
plt.xticks(range(2012, 2022), fontsize=text_size)
plt.xlabel("年份", fontsize=text_size)
plt.yticks(fontsize=text_size)
plt.ylabel("营业收入", fontsize=text_size)
plt.title("营业收入随时间变化趋势图(续)", fontsize=text_size)
plt.legend(loc=1, prop={'size': 15})
plt.grid()
# Basic EPS trend, part 1: one line per company; the df_eps columns are drawn
# in the fixed order 4, 8, 5, 6, 1.
plt.figure(figsize=(10, 8))
x = df_eps.index
for col in (4, 8, 5, 6, 1):
    plt.plot(
        x,
        df_eps.iloc[:, col],
        marker='s',
        markersize=7,
        label=df_eps.columns[col],
        linewidth=2.0,
    )
text_size = 16
plt.xticks(range(2012, 2022), fontsize=text_size)
plt.xlabel("年份", fontsize=text_size)
plt.yticks(fontsize=text_size)
plt.ylabel("基本每股收益", fontsize=text_size)
plt.title("基本每股收益随时间变化趋势图", fontsize=text_size)
plt.legend(loc=0, prop={'size': 15})
plt.grid()
# Basic EPS trend, continued: the remaining df_eps columns, drawn in the
# fixed order 9, 7, 3, 2, 0 on a new figure.
plt.figure(figsize=(10, 8))
for col in (9, 7, 3, 2, 0):
    plt.plot(
        df_eps.index,
        df_eps.iloc[:, col],
        marker='s',
        markersize=7,
        label=df_eps.columns[col],
        linewidth=2.0,
    )
text_size = 16
plt.xticks(range(2012, 2022), fontsize=text_size)
plt.xlabel("年份", fontsize=text_size)
plt.yticks(fontsize=text_size)
plt.ylabel("基本每股收益(元/股)", fontsize=text_size)
plt.title("基本每股收益随时间变化趋势图", fontsize=text_size)
plt.legend(loc=1, prop={'size': 15})
plt.grid()
# Revenue comparison bar charts: the first five rows of df_revenue on one
# figure, the remaining rows on a second one. Only the y-axis label differs.
revenue_bar_specs = (
    (df_revenue[:5], '营业收入'),
    (df_revenue[5:], '主营业务收入'),
)
for rows, y_label in revenue_bar_specs:
    # DataFrame.plot opens a new figure; the pyplot calls below style it.
    rows.plot(kind='bar', figsize=(10, 8), width=0.6)
    plt.xticks(fontsize=16, rotation=0)
    plt.xlabel('年份', fontsize=16, rotation=0)
    plt.yticks(fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.title('营业收入对比图', fontsize=16)
    plt.legend(loc=1, prop={'size': 14})
    plt.grid()
# Basic EPS comparison bar charts: rows 0-4 (2012-2016 per the original
# comment) and the remaining rows (2017-2021), with the columns in the same
# fixed order on both figures. The second chart uses a two-column legend.
eps_column_order = [4, 8, 5, 6, 1, 9, 7, 3, 2, 0]
eps_bar_specs = (
    (slice(None, 5), '基本每股收益', {}),
    (slice(5, None), '基本每股收益(元/股)', {'ncol': 2}),
)
for row_range, y_label, legend_extra in eps_bar_specs:
    # DataFrame.plot opens a new figure; the pyplot calls below style it.
    df_eps.iloc[row_range, eps_column_order].plot(kind='bar', figsize=(18, 9), width=0.6)
    plt.xticks(fontsize=16, rotation=0)
    plt.xlabel('年份', fontsize=16, rotation=0)
    plt.yticks(fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.title('基本每股收益对比图', fontsize=16)
    plt.legend(loc=1, prop={'size': 14}, **legend_extra)
    plt.grid()
# 结论:从所选公司的营业收入趋势来看,虽然中间有少许起伏波动,但医药制造业的总体发展趋势是向好的;其基本每股收益走势平缓,也说明整个行业受外界影响较小。