陈智勇的实验报告
代码
Step1 提取对应行业股票代码
import fitz
import re
import pandas as pd
import numpy as np
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import matplotlib.pyplot as plt
os.chdir('C:/Users/Lenovo/OneDrive/桌面/金融数据获取')
from parse_cninfo_table import *
# Read the industry-classification PDF and pull out the six-digit stock codes
# listed under industry 63 (the text between the "63" and "64" marker lines).
pdf1 = fitz.open('上市公司行业分类.pdf')
try:
    text = ''.join(page.get_text() for page in pdf1)
finally:
    # The extracted text is all that is needed from the document.
    pdf1.close()

# Raw strings keep the regex escapes (\n, \d) intact without relying on
# Python passing unknown string escapes through unchanged.
p1 = re.compile(r'\n63\n(.*?)\n64', re.DOTALL)
text_ind = re.findall(p1, text)
if not text_ind:
    raise ValueError('industry section 63 was not found in 上市公司行业分类.pdf')

# A stock code is a line consisting of exactly six digits.
p2 = re.compile(r'.*?\n(\d{6})\n.*?')
code = re.findall(p2, text_ind[0])
结果
Step2 下载年报
browser = webdriver.Edge() # drive Microsoft Edge; this shared session is reused by get_cninfo below
browser.maximize_window()
def get_cninfo(code):
    """Search cninfo's annual-report listing for one stock code.

    Uses the module-level ``browser`` session: opens the disclosure search
    page, types ``code`` into the search box, sets the date-range input to
    start at 2013-01-01, runs the query and returns the ``innerHTML`` of the
    result table body.

    The fixed sleeps give the page time to react between steps. Elements are
    looked up again before every interaction rather than cached.
    """
    search_box = ".el-autocomplete > .el-input--medium > .el-input__inner"
    query_button = ".query-btn"
    # presumably the start-date field of the date-range picker -- confirm against the page
    range_input = ".el-range-input:nth-child(2)"

    def css(selector):
        # Fresh lookup on every call.
        return browser.find_element(By.CSS_SELECTOR, selector)

    browser.get('http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh')
    css(search_box).send_keys(code)
    time.sleep(2)
    for key in (Keys.DOWN, Keys.ENTER):
        css(query_button).send_keys(key)
    time.sleep(2)

    css(range_input).click()
    time.sleep(0.5)
    css(range_input).clear()
    for keys in ("2013-01-01", Keys.ENTER):
        css(range_input).send_keys(keys)
    time.sleep(0.1)

    css(query_button).click()
    time.sleep(2)
    table_body = browser.find_element(By.CLASS_NAME, 'el-table__body')
    return table_body.get_attribute('innerHTML')
def html_to_df(innerHTML):
    """Convert the result-table HTML returned by ``get_cninfo`` to a DataFrame.

    The HTML is first written to ``innerHTML.html`` in the current directory
    and read back, then handed to ``DisclosureTable`` (presumably provided by
    the ``parse_cninfo_table`` star import -- verify), whose ``get_data()``
    result is returned.
    """
    # Context managers guarantee the handles are closed even if a write or
    # read raises part-way through.
    with open('innerHTML.html', 'w', encoding='utf-8') as f:
        f.write(innerHTML)
    with open('innerHTML.html', encoding='utf-8') as f:
        html = f.read()
    dt = DisclosureTable(html)
    return dt.get_data()
# Query every stock code and stack the per-company result tables into one
# DataFrame. DataFrame.append was removed in pandas 2.0 (and copied the whole
# frame on every call), so the pieces are collected and concatenated once.
frames = []
for i in code:
    innerHTML = get_cninfo(i)
    time.sleep(0.1)
    frames.append(html_to_df(innerHTML))
    time.sleep(0.1)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()
#df.to_csv('list.csv')
#df = pd.read_csv('list.csv')
#df = df.iloc[:,1:]
#df['证券代码'] = df['证券代码'].apply(lambda x:'{:0>6d}'.format(x))
def filter_links(words, df0, include=True):
    """Filter rows of ``df0`` by keywords in the '公告标题' column.

    Args:
        words: keywords to look for (substring match).
        df0: DataFrame with a '公告标题' column of strings.
        include: if True keep rows whose title contains ANY keyword;
            if False keep rows whose title contains NONE of them.

    Returns:
        A new DataFrame with the selected rows and all original columns.
        With an empty ``words`` list nothing is kept when ``include`` is
        True and everything is kept when it is False.
    """
    titles = df0['公告标题']
    if include:
        mask = [any(word in title for word in words) for title in titles]
    else:
        mask = [all(word not in title for word in words) for title in titles]
    # An explicit boolean array keeps this a row filter even when df0 is
    # empty; a plain empty list would be read as "select zero columns".
    return df0[np.array(mask, dtype=bool)]
# Drop announcements whose title marks them as a summary, a cancelled filing,
# an English or print edition, or a "正文" variant.
words1 = ["摘要", "已取消", "英文", "印刷", "正文"]
# NOTE(review): `list` shadows the builtin. The name is kept because the rest
# of the script refers to it; avoid calling the builtin list() after this point.
# .copy() makes the column assignments below write to an independent frame
# instead of a filtered view of df.
list = filter_links(words1, df, include=False).copy()


def fun1(x):
    """Cut everything that follows '报告' in a title."""
    return re.sub(r'(?<=报告).*', '', x)


def fun2(x):
    """Cut everything that precedes the four-digit year in a title."""
    return re.sub(r'.*(?=\d{4})', '', x)


# Reduce each title to its "20xx年...报告" core.
list['公告标题'] = list['公告标题'].apply(fun1)
list['公告标题'] = list['公告标题'].apply(fun2)
#exception = list[~list['公告标题'].str.contains('年年度报告')]
# Keep the first row per (code, title); the original author notes this is the
# most recent filing.
list = list.drop_duplicates(['证券代码', '公告标题'], keep='first')
# Titles with no four-digit year cannot be assigned to a year column, so they
# are dropped here instead of crashing the year extraction.
list = list[list['公告标题'].str.contains(r'\d{4}', na=False)].copy()
list['年份'] = list['公告标题'].str.extract(r'(\d{4})', expand=False)
list['公告标题'] = list['简称'] + list['公告标题']

# exist_ok lets the script be re-run after the folder has been created.
os.makedirs('files', exist_ok=True)
os.chdir('C:/Users/Lenovo/OneDrive/桌面/金融数据获取/files')
def get_pdf(r):
    """Download the PDF behind a cninfo announcement page.

    Args:
        r: response for the announcement detail page; only ``r.text`` is read.

    Returns:
        The ``requests`` response for the PDF on static.cninfo.com.cn.

    Raises:
        ValueError: if the page does not define ``announcementId`` and
            ``announcementTime``.
        requests.RequestException: on network errors, timeouts or a non-2xx
            status for the PDF request.
    """
    # [^"]* stops the id at its closing quote; a greedy .* could swallow later
    # statements on pages with more than one '";' before announcementTime.
    p_id = re.compile(r'.*var announcementId = "([^"]*)";.*var announcementTime = "(.*?)"', re.DOTALL)
    a_id = p_id.findall(r.text)
    if not a_id:
        raise ValueError('announcementId/announcementTime not found in page')
    announcement_id, announcement_time = a_id[0]
    new_url = "http://static.cninfo.com.cn/finalpage/" + announcement_time + '/' + announcement_id + ".PDF"
    # A timeout keeps one stalled download from hanging the whole crawl.
    result = requests.get(new_url, allow_redirects=True, timeout=30)
    # Surface HTTP errors here so an error page is never saved as a .PDF.
    result.raise_for_status()
    time.sleep(1)  # be polite to the server between downloads
    return result
# Download every remaining annual report as "<title>.PDF" into the current
# directory. A failure on one report is printed and the loop moves on.
for c in code:
    rpts = list[list['证券代码'] == c]
    for row in range(len(rpts)):
        # Positional columns: 2 is used as the file name, 3 as the detail-page
        # URL -- presumably '公告标题' and the link column; verify against
        # DisclosureTable.get_data().
        title = rpts.iloc[row, 2]
        try:
            r = requests.get(rpts.iloc[row, 3], allow_redirects=True, timeout=30)
            time.sleep(0.3)
            try:
                result = get_pdf(r)
            finally:
                r.close()
            with open(title + '.PDF', 'wb') as f:
                f.write(result.content)
        except Exception as exc:
            # Best effort: report which file failed and why, then continue.
            print(title, exc)
结果
Step3 解析年报数据
# Extract operating revenue and basic earnings per share from an annual report.
def get_adata(rpt):
    """Return ``(turnover, pe)`` parsed from an annual report.

    Args:
        rpt: iterable of pages, each exposing ``get_text()`` (e.g. an opened
            PyMuPDF document).

    Returns:
        A tuple of two strings: the first number found after '营' and before
        '归' in the "主要...数据和..." section with thousands separators
        removed, and the first number found between '基' and '稀'.
        Presumably these are the reporting year's operating revenue and basic
        EPS, assuming the current year is the first column -- verify.

    Raises:
        ValueError: if the section, one of the two rows, or a number inside a
            row cannot be found.
    """
    text = ''.join(page.get_text() for page in rpt)

    # From the "主要...数据和..." heading line up to the first '稀'.
    p_s = re.compile(r'(?<=\n)[\D、]?\D*?主要\D*?数据和\D*?(?=\n)(.*?)稀', re.DOTALL)
    section = p_s.search(text)
    if section is None:
        raise ValueError('key accounting data section not found')
    txt = section.group(0)

    # Optional minus sign, digits with thousands separators, 1-2 decimals.
    p_digit = re.compile(r'-?\d[,0-9]*?\.\d{1,2}')

    def first_number(row_pattern, label):
        # Locate the row, join its lines, and take the first number in it.
        row = re.search(row_pattern, txt, re.DOTALL)
        if row is None:
            raise ValueError(label + ' row not found')
        number = p_digit.search(row.group().replace('\n', ''))
        if number is None:
            raise ValueError(label + ' value not found')
        return number.group()

    turnover = first_number('营(.*?)归', 'operating revenue').replace(',', '')
    pe = first_number('基(.*?)稀', 'basic EPS')
    return turnover, pe
def get_bdata(rpt):
    """Return ``(office_address, website)`` parsed from an annual report.

    Args:
        rpt: iterable of pages, each exposing ``get_text()``.

    Returns:
        A tuple of two strings: the text following the first "...办公地址"
        label and the text following the first "公司...网址" label. Each value
        may sit on the label's own line or on the next line.

    Raises:
        ValueError: if either label cannot be found.
    """
    text = ''.join(page.get_text() for page in rpt)

    p1 = re.compile(r'(?<=\n)\w*办公地址:?\s?\n?(.*?)\s?(?=\n)', re.DOTALL)
    address = p1.search(text)
    if address is None:
        raise ValueError('office address not found')

    p2 = re.compile(r'(?<=\n)公司\w*网\s?址:?\s?\n?(.*?)\s?(?=\n)', re.DOTALL)
    website = p2.search(text)
    if website is None:
        raise ValueError('company website not found')

    return address.group(1), website.group(1)
# Build one table of operating revenue and one of basic EPS:
# one row per company, one column per year 2012-2021.
# (`list` is a DataFrame at this point, hence the unpacking instead of list().)
years = [*range(2012, 2022)]
turnovers = pd.DataFrame(columns=['公司'] + years)
pes = pd.DataFrame(columns=['公司'] + years)
for i, stock_code in enumerate(code):
    firm = list[list['证券代码'] == stock_code]
    if firm.empty:
        # No report survived filtering for this code; skip it instead of
        # crashing on firm.iloc[0, 1].
        print(stock_code + '没有可解析的年报')
        continue
    turnovers.loc[i, '公司'] = firm.iloc[0, 1]
    pes.loc[i, '公司'] = firm.iloc[0, 1]
    for item in range(len(firm)):
        title = firm.iloc[item, 2]
        try:
            year = int(firm.iloc[item, -1])  # last column is '年份'
            if year not in years:
                # Do not let .loc silently add a column outside 2012-2021.
                raise KeyError(year)
            rpt = fitz.open(title + '.PDF')
            try:
                turnover, pe = get_adata(rpt)
            finally:
                rpt.close()
            # Single .loc assignment; chained turnovers[year][i] = ... is not
            # guaranteed to write through to the frame.
            turnovers.loc[i, year] = turnover
            pes.loc[i, year] = pe
        except Exception:
            print(title + '解析出错')

turnovers_n = turnovers.iloc[:, 1:].astype('float')
turnovers_n.index = turnovers['公司']
turnovers_n.to_csv('营业收入汇总.csv')
pes_n = pes.iloc[:, 1:].astype('float')
pes_n.index = pes['公司']
# Collect office address and website for each company from its most recent
# annual report and save them to 公司信息.csv.
info = pd.DataFrame(columns=['股票代码', '股票简称', '办公地址', '公司网址'])
for i, stock_code in enumerate(code):
    firm = list[list['证券代码'] == stock_code]
    if firm.empty:
        print(stock_code + '没有可解析的年报')
        continue
    # Position of the row with the largest '年份' value.
    latest = firm['年份'].argsort().iloc[-1]
    # Resolved before the try so the error message itself cannot raise.
    title = firm.iloc[latest, 2]
    try:
        rpt = fitz.open(title + '.PDF')
        try:
            info1, info2 = get_bdata(rpt)
        finally:
            rpt.close()
    except Exception:
        print(title + '解析出错')
        continue
    info.loc[i, '股票代码'] = firm.iloc[0, 0]
    info.loc[i, '股票简称'] = firm.iloc[0, 1]
    info.loc[i, '办公地址'] = info1
    info.loc[i, '公司网址'] = info2
info.to_csv('公司信息.csv')
结果
Step4 绘制图表并分析
# Plot the 2012-2021 operating-revenue trend of one of the ten companies with
# the highest average revenue.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly with that font

# chart1 is deliberately an alias of turnovers_n: the '公司简称' column and
# the integer index added here are used again by the bar charts further down.
chart1 = turnovers_n
chart1['公司简称'] = turnovers_n.index
chart1.index = range(len(chart1))
# Ten-year average with missing years counted as zero (sum skips NaN).
chart1['mean'] = chart1.iloc[:, :10].sum(axis=1) / 10
# .copy() so the unit conversion below writes to an independent frame and
# turnovers_n stays in yuan.
chart1 = chart1.sort_values('mean', ascending=False)[:10].copy()
chart1.iloc[:, :10] = chart1.iloc[:, :10] / 100000000  # yuan -> 100 million yuan

i = 0  # row of the top-10 table to draw; change it to plot another company
plt.plot(chart1.columns[:10], chart1.iloc[i, :10], marker='o')
plt.xticks(np.arange(2012, 2022))
plt.xlabel('年份', fontsize=13)
plt.ylabel('营业收入(亿元)', fontsize=11)
plt.title(chart1.iloc[i, 10] + "营业收入折线图", fontsize=14)
plt.show()
各家公司2012-2021年度营业收入变化趋势图
# Bar chart comparing one year's operating revenue across companies.

# chart2 is an alias of pes_n while the columns are added, so pes_n also gains
# the '公司简称' column and an integer index.
chart2 = pes_n
chart2['公司简称'] = pes_n.index
chart2.index = range(len(chart2))
chart2['mean'] = chart2.iloc[:, :10].sum(axis=1) / 10
chart2 = chart2.sort_values('mean', ascending=False)[:10]

year = 2021
item = pd.concat([turnovers_n[year], turnovers_n['公司简称']], axis=1)
item[year] = item[year] / 100000000  # yuan -> 100 million yuan
item = item.sort_values(year, ascending=False).iloc[:10]
# Bars start at the default baseline of 0; the former bottom=2.0 lifted every
# bar by 2, so bar tops did not match the revenue values.
plt.bar(item['公司简称'], height=item[year], width=0.5)
plt.title(str(year) + '年营业收入分布柱状图')
plt.ylabel('营业收入(亿元)', fontsize=11)
plt.xticks(rotation=45)
plt.show()
逐年营业收入对比图
# Bar chart comparing one year's basic earnings per share across companies.
year = 2021
# Read from pes_n (EPS); the previous version re-plotted turnovers_n divided
# by 1e8 under an EPS title. EPS is already per share, so no unit conversion.
item = pd.concat([pes_n[year], pes_n['公司简称']], axis=1)
item = item.sort_values(year, ascending=False).iloc[:10]
# Default baseline of 0 so bar heights equal the EPS values.
plt.bar(item['公司简称'], height=item[year], width=0.5)
plt.title(str(year) + '年基本每股收益分布柱状图')
plt.ylabel('基本每股收益', fontsize=11)
plt.xticks(rotation=45)
plt.show()
每年度各公司营业收入与每股收益对比
行业解读
根据对电信、广播电视和卫星传输服务行业近十年营业收入和每股收益数据的分析,可以清楚地发现:大部分公司在疫情前呈现飞速发展的态势;但受疫情影响,在线业务的开展和信息传媒等业务都受到了不小的影响,电广传媒、东方明珠、歌华有线等行业龙头公司都出现了明显的下滑趋势。与此相反,广电网络、宜宾世纪等公司却呈现上升趋势,这可能是因为疫情期间在线业务的开展以及对宽带、流量、连接的依赖增长,带动了宽带业务、数据业务和创新业务收入的较快增长,而这些正是推动电信网络收入增长的重要因素。