小组(陈智诚、李禹含、朱健勇)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#寻找恒基达鑫年报
browser = webdriver.Edge()
browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
browser.find_element(By.CSS_SELECTOR, ".g-conbox").click()
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("恒基达鑫")
browser.find_element(By.CSS_SELECTOR, "strong").click()
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("宏川智慧")
browser.find_element(By.CSS_SELECTOR, "strong").click()
browser.find_element(By.CSS_SELECTOR, ".mainquery-container").click()
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("新宁物流")
browser.find_element(By.CSS_SELECTOR, "strong").click()
browser.find_element(By.CSS_SELECTOR, ".mainquery-container").click()
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("飞力达")
browser.find_element(By.CSS_SELECTOR, ".active:nth-child(1) > a").click()
browser.find_element(By.ID, "input_code").send_keys("海晨股份")
browser.find_element(By.CSS_SELECTOR, "strong").click()
browser.find_element(By.CSS_SELECTOR, "#select_gonggao .c-selectex-btn-text").click()
browser.find_element(By.LINK_TEXT, "年度报告").click()
browser.find_element(By.LINK_TEXT, "年度报告").click()
browser.find_element(By.CSS_SELECTOR, ".input-left").click()
browser.find_element(By.LINK_TEXT, "2022年").click()
browser.find_element(By.CSS_SELECTOR, ".active li:nth-child(112)").click()
browser.find_element(By.LINK_TEXT, "6月").click()
browser.find_element(By.CSS_SELECTOR, ".active > .dropdown-menu li:nth-child(1)").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 tr:nth-child(1) > .weekend:nth-child(7) > .tdcontainer").click()
browser.find_element(By.CSS_SELECTOR, ".selected:nth-child(7) > .tdcontainer").click()
browser.find_element(By.CSS_SELECTOR, ".selected:nth-child(7) > .tdcontainer").click()
browser.find_element(By.CSS_SELECTOR, ".today > .tdcontainer").click()
browser.find_element(By.ID, "query-btn").click()
element = browser.find_element(By.ID, 'disclosure-table')
innerHTML = element.get_attribute('innerHTML')
f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()
browser.quit()
#寻找宏川智慧和海晨股份年报
browser = webdriver.Edge()
browser.get("http://www.szse.cn/disclosure/listed/fixed/index.html")
browser.set_window_size(1552, 840)
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("宏川智慧")
browser.find_element(By.CSS_SELECTOR, "strong").click()
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("海晨股份")
browser.find_element(By.CSS_SELECTOR, ".active:nth-child(1) > a").click()
browser.execute_script("window.scrollTo(0,497.6000061035156)")
browser.find_element(By.CSS_SELECTOR, "#select_gonggao .c-selectex-btn-text").click()
browser.find_element(By.LINK_TEXT, "年度报告").click()
browser.find_element(By.CSS_SELECTOR, ".input-left").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-year span").click()
browser.find_element(By.CSS_SELECTOR, ".active li:nth-child(112)").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-month").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-month").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-month .glyphicon").click()
browser.find_element(By.CSS_SELECTOR, ".active > .dropdown-menu li:nth-child(3)").click()
browser.find_element(By.CSS_SELECTOR, ".today > .tdcontainer").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-2 tr:nth-child(3) > .available:nth-child(2) > .tdcontainer").click()
browser.find_element(By.ID, "query-btn").click()
element = browser.find_element(By.ID, 'disclosure-table')
innerHTML = element.get_attribute('innerHTML')
f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()
browser.quit()
#寻找飞力达年报
browser = webdriver.Edge()
browser.get("http://www.szse.cn/disclosure/listed/fixed/index.html")
browser.set_window_size(1552, 840)
browser.find_element(By.CSS_SELECTOR, ".mainquery-container").click()
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("飞力达")
browser.find_element(By.CSS_SELECTOR, ".active:nth-child(1) > a").click()
browser.find_element(By.CSS_SELECTOR, "#select_gonggao .c-selectex-btn-text").click()
browser.find_element(By.LINK_TEXT, "年度报告").click()
browser.find_element(By.CSS_SELECTOR, ".input-left").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-year span").click()
browser.find_element(By.CSS_SELECTOR, ".active li:nth-child(112)").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-month span").click()
browser.find_element(By.CSS_SELECTOR, ".active > .dropdown-menu li:nth-child(2)").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-2 tr:nth-child(2) > .available:nth-child(2) > .tdcontainer").click()
browser.find_element(By.ID, "query-btn").click()
element = browser.find_element(By.ID, 'disclosure-table')
innerHTML = element.get_attribute('innerHTML')
f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()
browser.quit()
#寻找新宁物流年报
browser = webdriver.Edge()
browser.get("http://www.szse.cn/disclosure/listed/fixed/index.html")
browser.set_window_size(1552, 840)
browser.find_element(By.ID, "input_code").click()
browser.find_element(By.ID, "input_code").send_keys("新宁物流")
browser.find_element(By.CSS_SELECTOR, ".active:nth-child(1) > a").click()
browser.find_element(By.CSS_SELECTOR, "#select_gonggao .c-selectex-btn-text").click()
browser.find_element(By.LINK_TEXT, "年度报告").click()
browser.find_element(By.CSS_SELECTOR, ".input-left").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-year span").click()
browser.find_element(By.CSS_SELECTOR, ".active li:nth-child(112)").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-month span").click()
browser.find_element(By.CSS_SELECTOR, ".active > .dropdown-menu li:nth-child(3)").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-1 tr:nth-child(1) > .available:nth-child(6) > .tdcontainer").click()
browser.find_element(By.CSS_SELECTOR, "#c-datepicker-menu-2 tr:nth-child(1) > .available:nth-child(4) > .tdcontainer").click()
browser.find_element(By.ID, "query-btn").click()
element = browser.find_element(By.ID, 'disclosure-table')
innerHTML = element.get_attribute('innerHTML')
f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()
browser.quit()
from bs4 import BeautifulSoup
import re
import pandas as pd
def to_pretty(fhtml):
f = open(fhtml,encoding='utf-8')
html = f.read()
f.close()
soup = BeautifulSoup(html)
html_prettified = soup.prettify()
f = open(fhtml[0:-5]+'-prettified.html', 'w', encoding='utf-8')
f.write(html_prettified)
f.close()
return(html_prettified)
html = to_pretty('innerHTML.html')
def txt_to_df(html):
# html table text to DataFrame
p = re.compile('(.*?) ', re.DOTALL)
trs = p.findall(html)
p2 = re.compile('(.*?)', re.DOTALL)
tds = [p2.findall(tr) for tr in trs[1:]]
df = pd.DataFrame({'证券代码': [td[0] for td in tds],
'简称': [td[1] for td in tds],
'公告标题': [td[2] for td in tds],
'公告时间': [td[3] for td in tds]})
return(df)
df_txt = txt_to_df(html)
p_a = re.compile('(.*?)', re.DOTALL)
p_span = re.compile('(.*?)', re.DOTALL)
get_code = lambda txt: p_a.search(txt).group(1).strip()
get_time = lambda txt: p_span.search(txt).group(1).strip()
def get_link(txt):
p_txt = '(.*?)'
p = re.compile(p_txt, re.DOTALL)
matchObj = p.search(txt)
attachpath = matchObj.group(1).strip()
href = matchObj.group(2).strip()
title = matchObj.group(3).strip()
return([attachpath, href, title])
def get_data(df_txt):
prefix = 'https://disc.szse.cn/download'
prefix_href = 'https://www.szse.cn/'
df = df_txt
codes = [get_code(td) for td in df['证券代码']]
short_names = [get_code(td) for td in df['简称']]
ahts = [get_link(td) for td in df['公告标题']]
times = [get_time(td) for td in df['公告时间']]
df = pd.DataFrame({'证券代码': codes,
'简称': short_names,
'公告标题': [aht[2] for aht in ahts],
'attachpath': [prefix + aht[0] for aht in ahts],
'href': [prefix_href + aht[1] for aht in ahts],
'公告时间': times
})
return(df)
df_data = get_data(df_txt)
df_data.to_csv('data_新宁物流.csv')
import fitz
import re
import pandas as pd
doc = fitz.open('行业分类.pdf')
txt = doc[84].get_text()
p1 = re.compile('(?<=\n)仓储业\n(.*)(?=邮政)',re.DOTALL)
a=p1.findall(txt)
print(a)
p2 = re.compile('(?:(\d+)\n(\w+)\n){1}')
b=p2.findall(a[0])
df = pd.DataFrame({'证券代码': [b[0] for b in b],
'简称': [b[1] for b in b]})
df.to_csv('仓储业.csv')
#下载年报
import fitz
import re
import pandas as pd
import requests
df = pd.read_csv('data_新宁物流.csv',header=0,index_col=0)
df['f_name']=df.iloc[:,1]+df.iloc[:,2]
df=df[['f_name','attachpath','简称']]
def filter_links(words,df,include=True):
ls = []
for word in words:
if include:
ls.append([word in f for f in df.f_name])
else:
ls.append([word not in f for f in df.f_name])
index = []
for r in range(len(df)):
flag = not include
for c in range(len(words)):
if include:
flag = flag or ls[c][r]
else:
flag = flag and ls[c][r]
index.append(flag)
df2 = df[index]
return(df2)
df_all = filter_links(['摘要','问询函','社会责任','取消','英文'],df,include=False)
df_original = filter_links(['(','('], df_all)
company=[]
for i in df_all['简称']:
if i not in company:
df_all[df_all['简称']==i].to_csv(i+'.csv')#把DATA中的不同公司年报下载链接分别提取出来成“公司名称+.csv”
company.append(i)
for i in company:
df=pd.read_csv(i+'.csv',header=0)
n=len(df['attachpath'])
for j in range(0,n):
href=df['attachpath'][j]
r = requests.get(href, allow_redirects=True)
f = open(df['f_name'][j]+'.pdf', 'wb')
f.write(r.content)
f.close()
r.close()
year_list=[]
sales_list=[]
pe_list=[]
list1=[]
list2=[]
#提取营业收入和基本每股收益数据
df_com = pd.read_csv('仓储业.csv',header=0,index_col=0)
for j in df_com['简称']:
com=pd.read_csv(j+'.csv',header=0,index_col=0)
for k in com['f_name']:
p_year = re.compile('\d{4}')
year = p_year.findall(k)
year1 = int(year[0])
if year1>2011:
year_list.append(year1)
doc = fitz.open(k+'.pdf')
for i in range(20):
page = doc[i]
txt =page.get_text()
p_sales = re.compile('(?<=\n)营业总?收入(?\w?)?\s?\n?([\d+,.]*)\s\n?',re.DOTALL)
a=p_sales.findall(txt)
if a !=[]:
c=len(a[0])
if c>0:
sales = re.sub(',','',a[0])
sales_list.append(sales)
list2.append(k)
break
for i in range(20):
page = doc[i]
txt =page.get_text()
p_pe = re.compile('(?<=\n)基本每股收益\s?\n?(?:(元/?/?╱?\n?股))?\s?\n?([-\d+,.]*)\s?\n?',re.DOTALL)
b = p_pe.findall(txt)
if b !=[]:
d=len(b[0])
if d>0 and ',' not in b[0]:
pe_list.append(b[0])
list1.append(k)
break
sales_list = [ float(x) for x in sales_list ]
pe_list = [ float(x) for x in pe_list ]
dff=pd.DataFrame({'年份':year_list,
'营业收入(元)':sales_list,
'基本每股收益(元/股)':pe_list})
dff.to_csv(j+'data.csv',encoding='utf-8-sig')
year_list=[]
sales_list=[]
pe_list=[]
add_list=[]
web_list=[]
for i in df_com['简称']:
doc = fitz.open(i+'2021年年度报告.pdf')
for j in range(15):
page = doc[j]
txt = page.get_text()
p_add = re.compile('(?<=\n)\w*办公地址:?\s?\n?(.*?)\s?(?=\n)',re.DOTALL)
com_add=p_add.findall(txt)
if com_add !=[]:
n = len(com_add[0])
if n >1:
add_list.append(com_add[0])
break
for i in df_com['简称']:
doc = fitz.open(i+'2021年年度报告.pdf')
for j in range(15):
page = doc[j]
txt = page.get_text()
p_web =re.compile('(?<=\n)公司\w*网址:?\s?\n?([a-z-A-Z./:\d+]*)\s?(?=\n)',re.DOTALL)
com_web=p_web.findall(txt)
if com_web !=[]:
n = len(com_web[0])
if n >1:
web_list.append(com_web[0])
list1.append(i)
break
df_com['办公地址'] = add_list
df_com['公司网址'] = web_list
df_com.to_csv('公司基本信息表.csv')
#画图
df_sale = pd.DataFrame()
for j in df_com['简称']:
df = pd.read_csv(j+'data.csv',index_col=1,header=0)
df = df.drop(df.columns[df.columns.str.contains('unnamed',case=False)],axis=1)
df_sales = pd.DataFrame(df['营业收入(元)'])
df_sales = df_sales.rename(columns={'营业收入(元)':j})
df_sale = pd.concat([df_sale,df_sales],axis=1)
df_eps = pd.DataFrame()
for j in df_com['简称']:
df = pd.read_csv(j+'data.csv',index_col=1,header=0)
df = df.drop(df.columns[df.columns.str.contains('unnamed',case=False)],axis=1)
df_epss = pd.DataFrame(df['基本每股收益(元/股)'])
df_epss = df_epss.rename(columns={'基本每股收益(元/股)':j})
df_eps = pd.concat([df_eps,df_epss],axis=1)
from matplotlib import pyplot as plt
plt.plot(df_sale,marker='*')
plt.legend(df_sale.columns)
plt.title('行业上市公司营业收入时间序列图')
plt.xlabel('年份')
plt.ylabel('营业收入(元)')
plt.grid()
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
plt.plot(df_eps,marker='*')
plt.legend(df_eps.columns)
plt.title('行业上市公司基本每股收益时间序列图')
plt.xlabel('年份')
plt.ylabel('基本每股收益(元/股)')
plt.grid()
import numpy as np
from matplotlib import pyplot as plt
bar_width = 0.15
bar_1 = np.arange(len(df_sale.index))
plt.bar(bar_1-2*bar_width, df_sale.iloc[:,0], width=bar_width, label=df_sale.columns[0])
plt.bar(bar_1-bar_width, df_sale.iloc[:,1], width=bar_width, label=df_sale.columns[1])
plt.bar(bar_1, df_sale.iloc[:,2], width=bar_width, label=df_sale.columns[2])
plt.bar(bar_1+bar_width, df_sale.iloc[:,3], width=bar_width, label=df_sale.columns[3])
plt.bar(bar_1+2*bar_width, df_sale.iloc[:,4], width=bar_width, label=df_sale.columns[4])
plt.xticks(bar_1, labels=df_sale.index)
plt.title('行业上市公司营业收入对比时间序列图')
plt.xlabel('年份')
plt.ylabel('营业收入(元)')
plt.grid()
plt.legend()
bar_1 = np.arange(len(df_eps.index))
bar_width = 0.15
plt.bar(bar_1-2*bar_width, df_eps.iloc[:,0], width=bar_width, label=df_eps.columns[0])
plt.bar(bar_1-bar_width, df_eps.iloc[:,1], width=bar_width, label=df_eps.columns[1])
plt.bar(bar_1, df_eps.iloc[:,2], width=bar_width, label=df_eps.columns[2])
plt.bar(bar_1+bar_width, df_eps.iloc[:,3], width=bar_width, label=df_eps.columns[3])
plt.bar(bar_1+2*bar_width, df_eps.iloc[:,4], width=bar_width, label=df_eps.columns[4])
plt.xticks(bar_1, labels=df_eps.index)
plt.title('行业上市公司基本每股收益对比时间序列图')
plt.xlabel('年份')
plt.ylabel('基本每股收益(元/股)')
plt.grid()
plt.legend()
解读 由图标可以看出,中储股份在仓储行业中领先于其他企业, 是行业中的龙头。近年来受疫情影响,每股收益有所下降, 十年间行业整体呈上升趋势,收入逐渐上升。对于我国来 说,充分利用已有的仓储资源的仓储社会化,提高仓储效率和仓储业分工 发展的专业化功能,加速满足社会生产发展和促进物流效 率提高的仓储标准化,提高仓储自身效率,实现仓储管理 的现代化,行业会有更好的发展。