import os
import re
import time
import warnings

import fitz
import pandas as pd
import requests
df = pd.read_csv('中药行业.csv')
# Standardize annual-report file names so that "<4-digit year>年度报告"
# becomes "<4-digit year>年年度报告". The look-behind only anchors on the year;
# the text "年度报告" itself is what gets replaced (a purely zero-width pattern
# would insert the replacement and leave the old wording in place).
p = re.compile(r'(?<=\d{4})年度报告')
df['f_name'] = [p.sub('年年度报告', f) for f in df.f_name]
del p
def filter_links(words, df, include=True):
    """Select rows of ``df`` by substrings found in the ``f_name`` column.

    Args:
        words: Substrings to look for in each ``f_name`` value.
        df: DataFrame with an ``f_name`` column of strings.
        include: If truthy, keep the rows whose name contains at least one
            of ``words``; if falsy, keep the rows whose name contains none
            of them. Pass a plain ``bool``: any non-empty list (for example
            ``[False]``) is truthy.

    Returns:
        A new DataFrame holding the selected rows with their original index
        labels and all columns. With empty ``words`` nothing matches, so
        ``include=True`` yields no rows and ``include=False`` yields all.
    """
    keep_matching = bool(include)
    mask = [
        any(word in name for word in words) == keep_matching
        for name in df.f_name
    ]
    # Wrap the mask in a boolean Series: a bare empty list would be read as
    # "select zero columns" instead of "select zero rows".
    selector = pd.Series(mask, index=df.index, dtype=bool)
    # Return an independent copy so callers can modify the result without
    # triggering pandas' SettingWithCopyWarning.
    return df[selector].copy()
# `include` must be a plain bool: a one-element list such as [False] is
# truthy and would turn every "exclude" filter below into an "include" one.

# Drop every document whose name contains one of these keywords.
df_all = filter_links(
    ['摘要', '问询函', '社会责任', '审计', '财务', '风险', '债券'],
    df,
    include=False,
)
# Names without a parenthesis go to df_orig; names with one go to df_updt.
# NOTE(review): the second list entry used to be a duplicate ASCII '(' --
# presumably a full-width '(' lost in transcription; confirm against the
# file names in the CSV.
df_orig = filter_links(['(', '('], df_all, include=False)
df_updt = filter_links(['(', '('], df_all, include=True)
# Remove entries whose name contains "取消".
df_updt = filter_links(['取消'], df_updt, include=False)
def sub_with_update(df_updt, df_orig):
    """Return a copy of ``df_orig`` patched with values from ``df_updt``.

    A row of ``df_orig`` is paired with a row of ``df_updt`` when the
    original ``f_name`` is a substring of the updated ``f_name``. For every
    such pair, the value in the second-to-last column of the original row is
    replaced by the value from the same column of the updated row. If
    several updated rows match one original row, the last match wins.

    Args:
        df_updt: DataFrame of updated entries; needs an ``f_name`` column.
        df_orig: DataFrame of original entries; needs an ``f_name`` column
            and the same column layout as ``df_updt``.

    Returns:
        A new DataFrame; ``df_orig`` itself is left untouched.
    """
    df_newest = df_orig.copy()
    for i, f in enumerate(df_orig.f_name):
        for j, fn in enumerate(df_updt.f_name):
            if f in fn:
                # Positional (iloc) access: the frames usually carry the
                # non-contiguous index labels left over from filtering.
                # NOTE(review): column -2 is presumably the link column --
                # confirm against the CSV schema.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
# Patch the original entries with the values from their updated versions.
df_newest = sub_with_update(df_updt, df_orig)
# Rebind instead of sorting in place: df_all is a filtered frame, and an
# in-place sort on it raises pandas' SettingWithCopyWarning.
df_all = df_all.sort_values(by=['f_name'], ignore_index=True)
# The first four characters of each file name serve as the company short name.
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
# Notebook output captured when the cell above was run:
#   <ipython-input-5-5cd6300745ff>:26: SettingWithCopyWarning:
#   A value is trying to be set on a copy of a slice from a DataFrame.
#   See the caveats in the documentation:
#   https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#     df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)
# value_counts() orders by descending frequency, so the first ten index
# entries are the ten short names that occur most often.
counts = df_newest['公司简称'].value_counts()
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]

# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs('10companies', exist_ok=True)

# One CSV per company, named after its short name.
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
# Snapshot of the output folder, taken before switching into it.
ten_csv = os.listdir('10companies')
os.chdir('C:/Users/Jan/.spyder-py3/homework4/10companies')

# Directory listing with two entries dropped by position: first the entry
# at index 0, then the entry at index 4 of the already shortened list.
# NOTE(review): os.listdir() returns entries in arbitrary order, so which
# files these positions refer to depends on the machine -- confirm intent.
f_1 = os.listdir()
del f_1[0]
del f_1[4]

# Start with empty containers for the page links and file names.
links, f_names = [], []
def get_PDF_url(url):
    """Fetch a page and extract the target and text of its first ``<a>`` tag.

    Args:
        url: Address of the HTML page to download.

    Returns:
        A tuple ``(href, fname)`` where ``href`` is the link target prefixed
        with the first 26 characters of the response URL and ``fname`` is
        the stripped link text. When the page contains no matching ``<a>``
        tag, a warning is issued and an empty tuple ``()`` is returned, so
        callers must check the result before unpacking it.
    """
    r = requests.get(url)
    r.encoding = 'utf-8'
    html = r.text
    r.close()  # the HTML has been read; release the connection

    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    # Only the first <a> tag is inspected; the page is assumed to put the
    # download link first.
    a = p.search(html)
    if a is None:
        # warnings.warn() actually reports the problem; merely constructing
        # a Warning instance has no visible effect.
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return ()

    href = a.group(1)
    fname = a.group(2).strip()
    # Build the absolute link from a fixed-length prefix of the response URL.
    # NOTE(review): the 26 characters presumably cover scheme + host of the
    # target site -- confirm; an https URL would be one character longer.
    href = r.url[:26] + href
    return (href, fname)
# Resolve every announcement page to its download link and file name.
hrefs = []
fnames = []
for link in links:
    result = get_PDF_url(link)
    if not result:
        # get_PDF_url() returns an empty tuple when no link was found;
        # skip that page instead of failing on tuple unpacking.
        continue
    href, fname = result
    hrefs.append(href)
    fnames.append(fname)

# Persist the resolved links to a CSV file.
df_final_links = pd.DataFrame({'href': hrefs, 'fname': fnames})
df_final_links.to_csv('中药links.csv')
# Reload the saved link table and download every file into the current
# working directory, using the stored name as the file name.
df_final_links = pd.read_csv('C:/Users/Jan/.spyder-py3/homework4/10companies/中药links.csv')
f_names = df_final_links['fname']
hrefs = df_final_links['href']
for href, f_name in zip(hrefs, f_names):
    r = requests.get(href, allow_redirects=True)
    try:
        # The context manager closes the file even if the write fails.
        with open('%s' % f_name, 'wb') as out_file:
            out_file.write(r.content)
    finally:
        # Release the connection before pausing rather than after.
        r.close()
    time.sleep(10)  # pause between consecutive downloads
import pdfplumber

# Extract the table from the page at index 9 (the tenth page) of the report.
# The context manager closes the PDF file once the table has been read.
with pdfplumber.open("600518康美药业2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[9]
    table = first_page.extract_table()
table
# Notebook output (value of `table`), one table row per line:
# [['', '第一季度 \n(1-3月份)', '第二季度 \n(4-6月份)', '第三季度 \n(7-9月份)', '第四季度 \n(10-12月份)'],
#  ['营业收入', '1,133,591,931.71', '1,383,223,297.49', '1,454,766,683.28', '1,440,426,049.18'],
#  ['归属于上市公司股东的净利润', '-542,470,073.45', '-881,372,238.52', '-947,691,532.32', '-25,364,409,578.30'],
#  ['归属于上市公司股东的扣除非\n经常性损益后的净利润', '-323,384,339.40', '-1,148,021,611.79', '-955,838,002.34', '-24,809,783,944.14'],
#  ['经营活动产生的现金流量净额', '216,103,610.97', '155,287,699.53', '513,297,751.60', '146,703,039.5']]
import matplotlib.pyplot as plt

# Font able to render the Chinese labels; the extra entries are fallbacks
# shipped with Windows, used when 'Arial Unicode MS' is not installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Microsoft YaHei']

# Quarterly operating revenue in units of 1e8 yuan. The heights must be
# numbers: strings would be plotted as categories, not as magnitudes.
num_list = [11.3359193171, 13.8322329749, 14.5476668328, 14.4042604918]
name_list = ['2020第一季度', '2020第二季度', '2020第三季度', '2020第四季度']

plt.bar(range(len(num_list)), num_list, color='brown', tick_label=name_list)
plt.ylabel('营业收入(单位:亿元)')
plt.title('康美药业营业收入走势图')
plt.show()
import pdfplumber

# Extract the table from the page at index 7 (the eighth page) of the report.
# The context manager closes the PDF file once the table has been read.
with pdfplumber.open("新天药业:2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[7]
    table = first_page.extract_table()
table
# Notebook output (value of `table`), one table row per line:
# [['', '第一季度', '第二季度', '第三季度', '第四季度'],
#  ['营业收入', '100,379,992.91', '202,482,314.25', '238,999,443.50', '209,084,639.65'],
#  ['归属于上市公司股东的净利润', '3,113,965.50', '22,090,393.72', '35,851,694.95', '13,047,156.14'],
#  ['归属于上市公司股东的扣除非经', '', '', '', ''],
#  [None, '4,182,758.45', '20,292,630.05', '33,859,689.54', '12,641,987.14'],
#  ['常性损益的净利润', None, None, None, None],
#  [None, '', '', '', ''],
#  ['经营活动产生的现金流量净额', '35,411,350.35', '26,175,062.10', '38,798,247.60', '75,382,867.11']]
import matplotlib.pyplot as plt

# Font able to render the Chinese labels; the extra entries are fallbacks
# shipped with Windows, used when 'Arial Unicode MS' is not installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Microsoft YaHei']

# Quarterly operating revenue in units of 1e8 yuan. The heights must be
# numbers: strings would be plotted as categories, not as magnitudes.
num_list = [1.0037999291, 2.0248231425, 2.3899944350, 2.0908463965]
name_list = ['2020第一季度', '2020第二季度', '2020第三季度', '2020第四季度']

plt.bar(range(len(num_list)), num_list, color='brown', tick_label=name_list)
plt.ylabel('营业收入(单位:亿元)')
plt.title('新天药业营业收入走势图')
plt.show()
import pdfplumber

# Extract the table from the page at index 6 (the seventh page) of the report.
# The context manager closes the PDF file once the table has been read.
with pdfplumber.open("贵州百灵:2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[6]
    table = first_page.extract_table()
table
# Notebook output (value of `table`), one table row per line:
# [['', '', '', None, '本年比上年增', '', None],
#  [None, None, '2019年', None, None, '2018年', None],
#  ['', '2020年', None, None, '减', None, None],
#  [None, None, '', None, None, '', None],
#  ['', '', '调整前', '调整后', '调整后', '调整前', '调整后'],
#  ['', '3,087,888,201.\n57', '2,850,585,250.', '2,850,585,250.\n 74', '', '3,136,843,231.', '3,136,843,231.'],
#  ['营业收入(元)', None, None, None, '8.32%', None, None],
#  [None, None, '74', None, None, '96', '96'],
#  ['', None, None, None, '', None, None],
#  ['归属于上市公司股东的净利', '152,375,357.74', '', '282,748,962.12', '', '', ''],
#  [None, None, '291,384,226.24', None, '-46.11%', '563,238,885.60', '563,238,885.60'],
#  ['润(元)', None, None, None, None, None, None],
#  [None, None, '', None, '', '', ''],
#  ['归属于上市公司股东的扣除', '', '', '', '', '', ''],
#  [None, '76,068,232.38', '230,240,541.18', '221,605,277.06', '-65.67%', '555,185,325.81', '555,185,325.81'],
#  ['非经常性损益的净利润(元)', None, None, None, None, None, None],
#  [None, '', '', '', '', '', ''],
#  ['经营活动产生的现金流量净', '231,779,655.70', '', '467,413,827.87', '', '-157,445,351.9', '-157,445,351.9'],
#  [None, None, '467,413,827.87', None, '-50.41%', None, None],
#  ['额(元)', None, None, None, None, '6', '6'],
#  [None, None, '', None, '', None, None],
#  ['基本每股收益(元/股)', '0.11', '0.21', '0.20', '-45.00%', '0.40', '0.40'],
#  ['稀释每股收益(元/股)', '0.11', '0.21', '0.20', '-45.00%', '0.39', '0.39'],
#  ['加权平均净资产收益率', '3.84%', '7.34%', '7.13%', '-3.29%', '15.53%', '15.53%'],
#  ['', '2020年末', '2019年末', None, '本年末比上年', '2018年末', None]]
import matplotlib.pyplot as plt

# Font able to render the Chinese labels; the extra entries are fallbacks
# shipped with Windows, used when 'Arial Unicode MS' is not installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Microsoft YaHei']

# Annual operating revenue in units of 1e8 yuan. The heights must be
# numbers: strings would be plotted as categories, not as magnitudes.
# NOTE(review): the bars run from 2020 back to 2018; reverse both lists if
# the chart should read chronologically.
num_list = [30.8788820157, 28.5058525074, 31.3684323196]
name_list = ['2020', '2019', '2018']

plt.bar(range(len(num_list)), num_list, color='brown', tick_label=name_list)
plt.ylabel('营业收入(单位:亿元)')
plt.title('贵州百灵营业收入走势图')
plt.show()