As the charts show, Phoenix Publishing & Media (ticker: 601928) leads the other listed companies in the news and publishing industry. In recent years, affected by the pandemic, both operating revenue and net profit attributable to shareholders of the listed companies have fallen, and over the past decade the industry as a whole has trended downward. The news and publishing industry has gone through a series of changes in the last two years; given the decline in operating revenue, it should accelerate digitalization, diversify its media offerings, pay attention to the rise of data journalism, and speed up the digital transformation of publishing so that operating revenue can return to an upward trend.
'''1. Fetch the annual-report links from the SSE website'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import re
import pandas as pd

Codes = ['601811','601858','601900','601921','601928','601949','601999','603096','603999','605577']

def get_table_sse(code):
    browser = webdriver.Edge()
    url = 'http://www.sse.com.cn/disclosure/listedinfo/regular/'
    browser.get(url)
    browser.set_window_size(1550, 830)
    time.sleep(3)
    browser.find_element(By.ID, "inputCode").click()
    browser.find_element(By.ID, "inputCode").send_keys(code)   # e.g. '601919'
    time.sleep(3)
    selector = '.sse_outerItem:nth-child(4) .filter-option-inner-inner'  # report-type dropdown
    browser.find_element(By.CSS_SELECTOR, selector).click()
    browser.find_element(By.LINK_TEXT, "年报").click()
    time.sleep(3)
    selector = "body > div.container.sse_content > div > "
    selector += "div.col-lg-9.col-xxl-10 > div > "
    selector += "div.sse_colContent.js_regular > "
    selector += "div.table-responsive > table"
    element = browser.find_element(By.CSS_SELECTOR, selector)
    table_html = element.get_attribute('innerHTML')
    fname = f'{code}.html'
    f = open(fname, 'w', encoding='utf-8')
    f.write(table_html)
    f.close()
    browser.quit()

def get_table_sse_codes(codes):
    for code in codes:
        get_table_sse(code)
def get_data(tr):
    # Split one table row into its <td> cells (the saved file is the table's innerHTML)
    p_td = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    tds = p_td.findall(tr)
    s = tds[0].find('>') + 1
    e = tds[0].rfind('<')
    code = tds[0][s:e]
    s = tds[1].find('>') + 1
    e = tds[1].rfind('<')
    name = tds[1][s:e]
    s = tds[2].find('href="') + 6
    e = tds[2].find('.pdf"') + 4
    href = 'http://www.sse.com.cn' + tds[2][s:e]
    s = tds[2].find('$(this))">') + 10
    e = tds[2].find('</a>')
    title = tds[2][s:e]
    date = tds[3].strip()
    data = [code, name, href, title, date]
    return data
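As a quick sanity check, get_data can be exercised on a single synthetic row body; the markup below is a simplified, hypothetical stand-in for one row of the saved innerHTML, not a real excerpt from the SSE page.

# Minimal sanity check for get_data (hypothetical, simplified row markup)
sample_tr = (
    '<td><span>601928</span></td>'
    '<td><span>凤凰传媒</span></td>'
    '<td><a href="/disclosure/listedinfo/announcement/c/sample.pdf" '
    'onclick="download($(this))">凤凰传媒年度报告</a></td>'
    '<td> 2023-04-28 </td>'
)
print(get_data(sample_tr))
# ['601928', '凤凰传媒',
#  'http://www.sse.com.cn/disclosure/listedinfo/announcement/c/sample.pdf',
#  '凤凰传媒年度报告', '2023-04-28']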
def parse_table(fname, save=True):
    f = open(fname, encoding='utf-8')
    html = f.read()
    f.close()
    # Split the saved table HTML into rows
    p = re.compile('<tr>(.+?)</tr>', re.DOTALL)
    trs = p.findall(html)
    trs_new = []
    for tr in trs:
        if tr.strip() != '':
            trs_new.append(tr)
    data_all = [get_data(tr) for tr in trs_new[1:]]
    df = pd.DataFrame({
        'code': [d[0] for d in data_all],
        'name': [d[1] for d in data_all],
        'href': [d[2] for d in data_all],
        'title': [d[3] for d in data_all],
        'date': [d[4] for d in data_all]
    })
    if save:
        df.to_csv(f'{fname[0:-5]}.csv')
    return df
get_table_sse_codes(Codes)
df1=parse_table('601811.html')
df2=parse_table('601858.html')
df3=parse_table('601900.html')
df4=parse_table('601921.html')
df5=parse_table('601928.html')
df6=parse_table('601949.html')
df7=parse_table('601999.html')
df8=parse_table('603096.html')
df9=parse_table('603999.html')
df10=parse_table('605577.html')
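The ten explicit parse_table calls above could equally be written as a loop over Codes; a minimal, functionally equivalent sketch (df_list is a hypothetical name):

# Equivalent loop form: df_list[0]..df_list[9] correspond to df1..df10
df_list = [parse_table(f'{code}.html') for code in Codes]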
'''2. Filter out unimportant announcement links'''
import time

def filter_links(words, df, include=True):
    ls = []
    for word in words:
        if include:
            ls.append([word in f for f in df['title']])
        else:
            ls.append([word not in f for f in df['title']])
    index = []
    for r in range(len(df)):
        flag = not include
        for c in range(len(words)):
            if include:
                flag = flag or ls[c][r]
            else:
                flag = flag and ls[c][r]
        index.append(flag)
    df2 = df[index]
    return df2
def filter_date(start, end, df):
    date = df['date']
    v = [d >= start and d <= end for d in date]
    df_new = df[v]
    return df_new

import datetime

def start_end_10y():
    dt_now = datetime.datetime.now()
    current_year = dt_now.year
    start = f'{current_year-9}-01-01'
    end = f'{current_year}-12-31'
    return (start, end)
def filter_nb_10y(df,
                  keep_words=['年报','年度报告'],
                  exclude_words=['摘要','修订稿','持续督导'],
                  start=''):
    if start == '':
        start, end = start_end_10y()
    else:
        start_y = int(start[0:4])
        end = f'{start_y + 9}-12-31'
    df = filter_links(keep_words, df, include=True)
    df = filter_links(exclude_words, df, include=False)
    df = filter_date(start, end, df)
    return df
df_all = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
df_all_n = []
for i in df_all:
    df_all_n.append(filter_nb_10y(i,
                                  keep_words=['年报','年度报告'],
                                  exclude_words=['摘要','修订稿','持续督导'],
                                  start=''))
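To see what the keyword filters keep, filter_links can be exercised on a tiny made-up DataFrame; the titles below are illustrative only, not output from the real announcement tables.

# Illustration of filter_links on hypothetical titles
demo = pd.DataFrame({'title': ['2022年年度报告', '2022年年度报告摘要', '第一季度报告'],
                     'date':  ['2023-04-28', '2023-04-28', '2023-04-15']})
kept = filter_links(['年报', '年度报告'], demo, include=True)   # keeps the two annual-report rows
kept = filter_links(['摘要'], kept, include=False)              # drops the abstract
print(list(kept['title']))   # ['2022年年度报告']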
'''3. Download the annual reports'''
import requests

def download_pdf(href, code, year):
    r = requests.get(href, allow_redirects=True)
    fname = f'{code}_{year}.pdf'
    f = open(fname, 'wb')
    f.write(r.content)
    f.close()
    r.close()
def download_pdfs(hrefs, code, years):
    for i in range(len(hrefs)):
        href = hrefs[i]
        year = years[i]
        download_pdf(href, code, year)
        time.sleep(30)
    return()

def download_pdfs_codes(list_hrefs, codes, list_years):
    for i in range(len(list_hrefs)):
        hrefs = list_hrefs[i]
        years = list_years[i]
        code = codes[i]
        download_pdfs(hrefs, code, years)
    return()
hrefs = []
for i in range(10):
    hrefs.append(list(df_all_n[i]['href']))
years = []
for i in range(10):
    years.append(list(df_all_n[i]['date']))
download_pdfs_codes(hrefs, Codes, years)
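download_pdf above saves whatever the server returns; a slightly more defensive variant (a sketch, not part of the original workflow; download_pdf_safe is a hypothetical name) could check the HTTP status and let a context manager close the file:

# Hedged alternative: only write the PDF when the request actually succeeded
def download_pdf_safe(href, code, year):
    r = requests.get(href, allow_redirects=True, timeout=60)
    if r.status_code == 200:
        with open(f'{code}_{year}.pdf', 'wb') as f:
            f.write(r.content)
    r.close()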
'''4. Parse the annual reports'''
import fitz
import pandas as pd
import re

# Extract operating revenue and net profit attributable to shareholders of the listed company
def get_th_span(txt):
    nianfen = r'(20\d\d|199\d)\s*年末?'
    s = rf'{nianfen}\s*{nianfen}.*?{nianfen}'
    p = re.compile(s, re.DOTALL)
    matchobj = p.search(txt)
    end = matchobj.end()
    year1 = matchobj.group(1)
    year2 = matchobj.group(2)
    year3 = matchobj.group(3)
    flag = (int(year1) - int(year2) == 1) and (int(year2) - int(year3) == 1)
    while not flag:
        # Keep searching until three consecutive years appear in the table header
        matchobj = p.search(txt[end:])
        end = matchobj.end()
        year1 = matchobj.group(1)
        year2 = matchobj.group(2)
        year3 = matchobj.group(3)
        flag = (int(year1) - int(year2) == 1)
        flag = flag and (int(year2) - int(year3) == 1)
    return matchobj.span()
def get_bounds(txt):
    th_span_1st = get_th_span(txt)
    end = th_span_1st[1]
    th_span_2nd = get_th_span(txt[end:])
    th_span_2nd = (end + th_span_2nd[0], end + th_span_2nd[1])
    #
    s = th_span_1st[1]
    e = th_span_2nd[0]
    #
    while txt[e] not in '0123456789':
        e = e - 1
    return (s, e)
def get_subtext(doc, bounds=('主要会计数据和财务指标', '总资产')):
    start_pageno = 0
    end_pageno = len(doc) - 1
    lb, ub = bounds
    for n in range(len(doc)):
        page = doc[n]
        txt = page.get_text()
        if lb in txt:
            start_pageno = n; break
    for n in range(start_pageno, len(doc)):
        if ub in doc[n].get_text():
            end_pageno = n; break
    txt = ''
    for n in range(start_pageno, end_pageno + 1):
        page = doc[n]
        txt += page.get_text()
    return txt
def parse_fin(txt):
    sales = []
    income = []
    # Extract operating revenue
    p_sales = re.compile(r'营业收入\s?\n?([-\d+,.]*)\s', re.DOTALL)
    p_sales = p_sales.findall(txt)
    if p_sales != []:
        sales1 = re.sub(',', '', p_sales[0])
        sales.append(sales1)
    # Extract net profit attributable to shareholders of the listed company
    p_income = re.compile(r'东的净利润\s?\n?([-\d+,.]*)\s', re.DOTALL)
    p_income = p_income.findall(txt)
    if p_income != []:
        income1 = re.sub(',', '', p_income[0])
        income.append(income1)
    else:
        income.append('未提取出来')  # placeholder meaning "not extracted"
    # Assemble the two figures into a one-row DataFrame
    sales = pd.DataFrame(sales)
    income = pd.DataFrame(income)
    agg = []
    agg = pd.DataFrame(agg)
    agg['营业收入'] = sales.iloc[:, 0:1]
    agg['归属于上市公司股东的净利润'] = income.iloc[:, 0:1]
    return agg
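parse_fin can be checked against a small synthetic text laid out the way the regexes expect; the figures below are invented purely for illustration.

# Synthetic excerpt mimicking the key-figures table text (values are made up)
demo_txt = '营业收入\n1,234,567.89\n归属于上市公司股东的净利润\n234,567.89\n'
print(parse_fin(demo_txt))
# Expected: one row with 营业收入 = '1234567.89' and 净利润 = '234567.89' (still strings at this stage)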
codes = ['601811','601858','601900','601921','601928','601949','601999','603096','603999','605577']
import requests
hh = [[],[],[],[],[],[],[],[],[],[]]
for m in range(len(codes)):
    findataz = []
    colnames = ['营业收入','归属于上市公司股东的净利润']
    findataz = pd.DataFrame(findataz, columns=colnames)
    for i in range(len(years[m])):
        filename = f'{codes[m]}_{years[m][i]}.pdf'
        doc = fitz.open(filename)
        txt = get_subtext(doc)
        span = get_bounds(txt)
        subtxt = txt[span[0]:span[1]]
        fin_data = parse_fin(subtxt)
        findataz = pd.concat([findataz, fin_data])
    hh[m] = findataz
# Manually patch entries in the net-profit column that were not extracted correctly
hh[0].iloc[0:1,1:2]='1396673063.27'
hh[2].iloc[0:1,1:2]='943358883.70'
hh[2].iloc[4:5,1:2]='655293807.85'
hh[2].iloc[5:6,1:2]='611423556.91'
hh[2].iloc[6:7,1:2]='422261959.62'
hh[3].iloc[0:1,1:2]='1413562895.84'
hh[3].iloc[1:2,1:2]='1317067651.75'
hh[4].iloc[1:2,1:2]='2456754308.90'
hh[4].iloc[4:5,1:2]='1324895653.69'
hh[5].iloc[0:1,1:2]='650808316.46'
hh[5].iloc[1:2,1:2]='779934519.78'
hh[5].iloc[2:3,1:2]='740968757.59'
hh[5].iloc[3:4,1:2]='740968757.59'
hh[5].iloc[4:5,1:2]='702771582.88'
hh[5].iloc[5:6,1:2]='601490557.41'
hh[5].iloc[6:7,1:2]='530652216.90'
hh[6].iloc[1:2,1:2]='108765384.53'
hh[6].iloc[2:3,1:2]='152712660.34'
hh[6].iloc[3:4,1:2]='148286660.15'
hh[6].iloc[4:5,1:2]='177328639.04'
hh[6].iloc[6:7,1:2]='122946719.40'
hh[6].iloc[7:8,1:2]='80189714.61'
hh[6].iloc[8:9,1:2]='75009247.64'
hh[6].iloc[9:10,1:2]='70033778.14'
hh[7].iloc[0:1,1:2]='136950321.84'
hh[7].iloc[2:3,1:2]='219689297.19'
hh[7].iloc[3:4,1:2]='240280588.57'
hh[7].iloc[4:5,1:2]='240814176.91'
hh[7].iloc[5:6,1:2]='232322877.80'
hh[8].iloc[2:3,1:2]='14270825.95'
hh[8].iloc[5:6,1:2]='74435174.79'
hh[8].iloc[7:8,1:2]='6619360.13'
# Convert the extracted strings to floats (in billions of yuan)
for i in range(len(hh)):
    # hh[i].astype(float)
    hh[i] = hh[i].apply(pd.to_numeric, errors='coerce')
    hh[i] = hh[i].div(1000000000)
hh[5].index = range(7)
hh[5] = hh[5].drop(3)  # drop the duplicated row (rows 2 and 3 carry the same value)
for i in range(len(hh)):
    hh[i].index = range(2023 - len(hh[i]), 2023)  # re-index each table by report year
import matplotlib
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']
mpl.rcParams['axes.unicode_minus'] = False

# One two-panel figure per company: revenue on top, net profit below
for i in range(len(codes)):
    plt.figure(figsize=(11, 9))
    plt.subplot(2, 1, 1)
    plt.plot(hh[i]['营业收入'], 'r-', label=codes[i], lw=2.0)
    plt.xticks(fontsize=13, rotation=30)
    plt.xlabel(u'日期', fontsize=13)
    plt.yticks(fontsize=13)
    plt.ylabel(u'营业收入(单位:十亿元)', fontsize=13)
    plt.legend(loc=0, fontsize=13)
    plt.grid()
    plt.subplot(2, 1, 2)
    plt.plot(hh[i]['归属于上市公司股东的净利润'], 'b-', label=codes[i], lw=2.0)
    plt.xticks(fontsize=13, rotation=30)
    plt.xlabel(u'日期', fontsize=13)
    plt.yticks(fontsize=13)
    plt.ylabel(u'归属于上市公司股东的净利润(单位:十亿元)', fontsize=13)
    plt.legend(loc=0, fontsize=13)
    plt.grid()

# One combined figure comparing revenue across all ten companies
plt.figure(figsize=(11, 9))
for i in range(len(codes)):
    plt.plot(hh[i]['营业收入'], label=codes[i], lw=2.0)
plt.xticks(fontsize=13, rotation=30)
plt.xlabel(u'日期', fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel(u'营业收入(单位:十亿元)', fontsize=13)
plt.legend(loc=0, fontsize=13)
plt.grid()
# Extract the company website, e-mail address, office address, and the board secretary's name and phone number
def get_subtext_xx(doc, bounds=('公司的法定代表人', '媒体名称')):
    start_pageno = 0
    end_pageno = len(doc) - 1
    lb, ub = bounds
    for n in range(len(doc)):
        page = doc[n]
        txt = page.get_text()
        if lb in txt:
            start_pageno = n; break
    for n in range(start_pageno, len(doc)):
        if ub in doc[n].get_text():
            end_pageno = n; break
    txt = ''
    for n in range(start_pageno, end_pageno + 1):
        page = doc[n]
        txt += page.get_text()
    return txt
def parse_dm_acc(txt):
    # Board secretary's name
    dm_name = re.compile(r'证券事务代表\s+姓名\s+([\u2E80-\u9FFF]+)\s+[\u2E80-\u9FFF]', re.DOTALL)
    dm_name = dm_name.findall(txt)
    # Board secretary's phone number
    dm_pho = re.compile(r'电话\s?\n?([-\d+,.]*)\s', re.DOTALL)
    dm_pho = dm_pho.findall(txt)
    # Company website
    acc_her = re.compile(r'(?<=公司网址)(.*?)(?=电子信箱)', re.DOTALL)
    acc_her = acc_her.findall(txt)
    # E-mail address
    acc_ema = re.compile(r'(?<=公司网址).*?(?<=电子信箱)\s+(.*?)\s+(?=四、 信息披露及备置地点)', re.DOTALL)
    acc_ema = acc_ema.findall(txt)
    # Office address
    acc_sta = re.compile(r'(?<=办公地址)(.*?)(?=公司办公地址的)', re.DOTALL)
    acc_sta = acc_sta.findall(txt)
    # Assemble everything into a one-row DataFrame
    dm_acc = pd.DataFrame(index=range(1))
    dm_acc['董秘姓名'] = dm_name
    dm_acc['董秘电话'] = dm_pho
    dm_acc['公司网址'] = acc_her
    dm_acc['电子邮箱'] = acc_ema
    dm_acc['办公地址'] = acc_sta
    return dm_acc
d_a = []
colname = ['董秘姓名','董秘电话','公司网址','电子邮箱','办公地址']
d_a = pd.DataFrame(d_a, columns=colname)
for m in range(len(codes)):
    filename = f'{codes[m]}_{years[m][1]}.pdf'
    doc = fitz.open(filename)
    txt = get_subtext_xx(doc)
    d_a_data = parse_dm_acc(txt)
    d_a = pd.concat([d_a, d_a_data])
d_a.to_csv('公司基本信息.csv', encoding='utf_8_sig')