import requests
import re 
import pandas as pd
import os
import fitz
import time


# Read the document list; the CSV must provide an `f_name` column.
df = pd.read_csv('中药行业.csv')
# Normalise annual-report file names: "2020年度…" -> "2020年年度…".
# The look-behind covers only the four-digit year, so "年度" itself is
# consumed and replaced. (With "年度" inside the look-behind the pattern
# matched an empty position and the replacement was inserted after the
# existing text, duplicating it.) Names already written as "年年度" are
# left alone because "年度" is then not directly preceded by four digits.
p = re.compile(r'(?<=\d{4})年度')
f_names = [p.sub('年年度', f) for f in df.f_name]
df['f_name'] = f_names
# Drop the temporaries so they do not linger at module level.
del p, f_names


def filter_links(words, df, include=True):
    """Select rows of *df* by keywords found in the ``f_name`` column.

    Parameters
    ----------
    words : sequence of str
        Keywords tested with the ``in`` operator against each file name.
    df : pandas.DataFrame
        Frame with an ``f_name`` column of strings.
    include : bool, or one-element list/tuple holding a bool
        True  -> keep rows whose name contains ANY of the keywords.
        False -> keep rows whose name contains NONE of the keywords.
        The call sites in this file pass ``[False]`` / ``[True]``; a
        one-element sequence is unwrapped so that ``[False]`` is not
        mistaken for a truthy value.

    Returns
    -------
    pandas.DataFrame
        A new frame (a copy) holding the selected rows with their
        original index labels. An empty input yields an empty frame.
    """
    # Unwrap include=[flag] before testing truthiness: a non-empty list is
    # always truthy, which would silently select the "include" branch.
    if isinstance(include, (list, tuple)) and len(include) == 1:
        include = include[0]
    include = bool(include)

    names = list(df.f_name)
    if include:
        flags = [any(word in name for word in words) for name in names]
    else:
        flags = [all(word not in name for word in words) for name in names]

    # A typed boolean array keeps the selection a row mask even when it is
    # empty (a bare empty list would be read as "select no columns").
    mask = pd.Series(flags, dtype=bool).to_numpy()
    # Return a copy so that later in-place edits of the result do not
    # trigger SettingWithCopyWarning.
    return df[mask].copy()


# Keyword groups used to split the document list.
_non_report_words = ['摘要', '问询函', '社会责任', '审计', '财务', '风险', '债券']
_paren_words = ['(', '(']
_cancel_words = ['取消']

# NOTE(review): `include` is passed as a one-element list, not a bare bool.
# A non-empty list is truthy in a plain `if include:` test, so confirm that
# filter_links interprets [False] as "exclude" before relying on these frames.
df_all = filter_links(_non_report_words, df, include=[False])
df_orig = filter_links(_paren_words, df_all, include=[False])
df_updt = filter_links(_paren_words, df_all, include=[True])
df_updt = filter_links(_cancel_words, df_updt, include=[False])


def sub_with_update(df_updt, df_orig):
    """Return a copy of *df_orig* with entries replaced by their updates.

    A row of *df_updt* is treated as the update of a row of *df_orig* when
    the original ``f_name`` is a substring of the updated ``f_name``. For
    every such pair the second-to-last column of the original row is
    overwritten with the value from the updated row; when several updates
    match the same original, the last one in *df_updt* order wins.

    Parameters
    ----------
    df_updt : pandas.DataFrame
        Updated documents; needs an ``f_name`` column.
    df_orig : pandas.DataFrame
        Original documents; needs an ``f_name`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        The patched copy of *df_orig*.
    """
    # Work on a copy: the replacements previously went into df_orig, so the
    # returned frame was unchanged and the caller's input was mutated.
    df_newest = df_orig.copy()
    updated_names = list(df_updt.f_name)
    for i, name in enumerate(df_orig.f_name):
        for j, updated_name in enumerate(updated_names):
            if name in updated_name:
                # Column -2 is addressed by position; presumably it holds the
                # document link -- TODO confirm against the CSV layout.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest

# Patch the original reports with their updated versions.
df_newest = sub_with_update(df_updt, df_orig)
# Rebind rather than sort in place: df_all was produced by filtering another
# frame, and an in-place sort on such a result emits SettingWithCopyWarning.
df_all = df_all.sort_values(by=['f_name'], ignore_index=True)
# Use the first four characters of each file name as the company key
# (assumes names start with a four-character company abbreviation --
# TODO confirm against the data).
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]

# Notebook output captured when the statements above were run (kept for reference):
# <ipython-input-5-5cd6300745ff>:26: SettingWithCopyWarning:
# A value is trying to be set on a copy of a slice from a DataFrame
#
# See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#   df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)


# Number of rows per company key; value_counts() lists the most frequent first.
counts = df_newest['公司简称'].value_counts()

# One filtered frame for each of the ten most frequent company keys.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]

# Make sure the output folder exists before writing into it.
if not os.path.exists('10companies'):
    os.makedirs('10companies')

# Write each company's rows to its own CSV, named after the company key.
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)

ten_csv = os.listdir('10companies')

# NOTE(review): hard-coded absolute path; from here on all relative paths
# resolve inside this folder.
os.chdir('C:/Users/Jan/.spyder-py3/homework4/10companies')
f_1 = os.listdir()
# Drop the first entry, then the entry at index 4 of the shortened list
# (which files these are depends on the directory listing -- TODO confirm).
del f_1[0]
del f_1[4]

links = []
f_names = []


def get_PDF_url(url):
    """Fetch *url* and extract the first ``<a href=...>`` link on the page.

    Parameters
    ----------
    url : str
        Address of the page expected to contain the download link.

    Returns
    -------
    tuple
        ``(href, fname)`` where ``href`` is the link target prefixed with
        the first 26 characters of the response URL and ``fname`` is the
        stripped link text. An empty tuple ``()`` is returned (and a
        warning is emitted) when the page contains no matching ``<a>`` tag,
        so callers must check the result before unpacking it.
    """
    import warnings  # local import: only needed on the failure path

    r = requests.get(url)
    try:
        r.encoding = 'utf-8'
        html = r.text
    finally:
        # The HTML has been read; release the connection even on error.
        r.close()
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    # The first <a> tag is the target, so search() is sufficient.
    a = p.search(html)
    if a is None:
        # Actually emit the warning; merely constructing Warning(...) is a no-op.
        warnings.warn('没有找到下载链接。请手动检查链接：%s' % url)
        return ()
    href = a.group(1)
    fname = a.group(2).strip()
    # Build the full link. NOTE(review): the 26-character prefix is presumably
    # the scheme and host of the site -- confirm it fits every URL used.
    href = r.url[:26] + href
    return (href, fname)


hrefs = []
fnames = []

for link in links:
    result = get_PDF_url(link)
    if not result:
        # get_PDF_url returns an empty tuple when no link is found;
        # unpacking it directly would raise ValueError and abort the run.
        print('skipped (no download link found): %s' % link)
        continue
    href, fname = result
    hrefs.append(href)
    fnames.append(fname)
    # The CSV is rewritten after every link and never when `links` is empty,
    # so an existing file is not replaced by an empty one. Presumably this
    # also serves as a progress checkpoint -- TODO confirm.
    df_final_links = pd.DataFrame({'href': hrefs, 'fname': fnames})
    df_final_links.to_csv('中药links.csv')


# Reload the saved link table and download every file listed in it.
df_final_links = pd.read_csv('C:/Users/Jan/.spyder-py3/homework4/10companies/中药links.csv')
f_names = df_final_links['fname']
hrefs = df_final_links['href']
for href, f_name in zip(hrefs, f_names):
    r = requests.get(href, allow_redirects=True)
    try:
        # The context manager closes the output file after each download.
        with open('%s' % f_name, 'wb') as pdf_file:
            pdf_file.write(r.content)
    finally:
        # Close every response, not only the last one; this also avoids a
        # NameError after the loop when the table is empty.
        r.close()
    # Pause between requests.
    time.sleep(10)


import pdfplumber

# Extract the table from the page at index 9 (the tenth page) of the report.
# The context manager closes the PDF once the table has been read.
with pdfplumber.open("600518康美药业2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[9]  # the name is historical; this is not page 1
    table = first_page.extract_table()
# Bare expression: displays the table in a notebook, no effect in a script.
table

[['',
  '第一季度 \n（1-3月份）',
  '第二季度 \n（4-6月份）',
  '第三季度 \n（7-9月份）',
  '第四季度 \n（10-12月份）'],
 ['营业收入',
  '1,133,591,931.71',
  '1,383,223,297.49',
  '1,454,766,683.28',
  '1,440,426,049.18'],
 ['归属于上市公司股东的净利润',
  '-542,470,073.45',
  '-881,372,238.52',
  '-947,691,532.32',
  '-25,364,409,578.30'],
 ['归属于上市公司股东的扣除非\n经常性损益后的净利润',
  '-323,384,339.40',
  '-1,148,021,611.79',
  '-955,838,002.34',
  '-24,809,783,944.14'],
 ['经营活动产生的现金流量净额',
  '216,103,610.97',
  '155,287,699.53',
  '513,297,751.60',
  '146,703,039.5']]


import matplotlib.pyplot as plt

# Presumably chosen so the Chinese labels render; the font must be installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# Quarterly operating revenue in units of 亿元 (1e8 yuan), i.e.
# 1,133,591,931.71 / 1,383,223,297.49 / 1,454,766,683.28 / 1,440,426,049.18
# yuan. These must be numbers: strings such as '11.33,591,93171' are not
# plotted as numeric bar heights.
num_list = [11.3359193171, 13.8322329749, 14.5476668328, 14.4042604918]  # data

name_list = ['2020第一季度', '2020第二季度', '2020第三季度', '2020第四季度']
plt.bar(range(len(num_list)), num_list, color='brown', tick_label=name_list)
plt.ylabel('营业收入（单位：亿元)')
plt.title('康美药业营业收入走势图')
plt.show()


import pdfplumber

# Extract the table from the page at index 7 (the eighth page) of the report.
# The context manager closes the PDF once the table has been read.
with pdfplumber.open("新天药业：2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[7]  # the name is historical; this is not page 1
    table = first_page.extract_table()
# Bare expression: displays the table in a notebook, no effect in a script.
table

[['', '第一季度', '第二季度', '第三季度', '第四季度'],
 ['营业收入',
  '100,379,992.91',
  '202,482,314.25',
  '238,999,443.50',
  '209,084,639.65'],
 ['归属于上市公司股东的净利润',
  '3,113,965.50',
  '22,090,393.72',
  '35,851,694.95',
  '13,047,156.14'],
 ['归属于上市公司股东的扣除非经', '', '', '', ''],
 [None, '4,182,758.45', '20,292,630.05', '33,859,689.54', '12,641,987.14'],
 ['常性损益的净利润', None, None, None, None],
 [None, '', '', '', ''],
 ['经营活动产生的现金流量净额',
  '35,411,350.35',
  '26,175,062.10',
  '38,798,247.60',
  '75,382,867.11']]


import matplotlib.pyplot as plt

# Presumably chosen so the Chinese labels render; the font must be installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# Quarterly operating revenue in units of 亿元 (1e8 yuan), i.e.
# 100,379,992.91 / 202,482,314.25 / 238,999,443.50 / 209,084,639.65 yuan.
# These must be numbers: strings such as '1.00,379,99291' are not plotted
# as numeric bar heights.
num_list = [1.0037999291, 2.0248231425, 2.3899944350, 2.0908463965]  # data

name_list = ['2020第一季度', '2020第二季度', '2020第三季度', '2020第四季度']
plt.bar(range(len(num_list)), num_list, color='brown', tick_label=name_list)
plt.ylabel('营业收入（单位：亿元)')
plt.title('新天药业营业收入走势图')
plt.show()


import pdfplumber

# Extract the table from the page at index 6 (the seventh page) of the report.
# The context manager closes the PDF once the table has been read.
with pdfplumber.open("贵州百灵：2020年年度报告.pdf") as pdf:
    first_page = pdf.pages[6]  # the name is historical; this is not page 1
    table = first_page.extract_table()
# Bare expression: displays the table in a notebook, no effect in a script.
table

[['', '', '', None, '本年比上年增', '', None],
 [None, None, '2019年', None, None, '2018年', None],
 ['', '2020年', None, None, '减', None, None],
 [None, None, '', None, None, '', None],
 ['', '', '调整前', '调整后', '调整后', '调整前', '调整后'],
 ['',
  '3,087,888,201.\n57',
  '2,850,585,250.',
  '2,850,585,250.\n  74',
  '',
  '3,136,843,231.',
  '3,136,843,231.'],
 ['营业收入（元）', None, None, None, '8.32%', None, None],
 [None, None, '74', None, None, '96', '96'],
 ['', None, None, None, '', None, None],
 ['归属于上市公司股东的净利', '152,375,357.74', '', '282,748,962.12', '', '', ''],
 [None,
  None,
  '291,384,226.24',
  None,
  '-46.11%',
  '563,238,885.60',
  '563,238,885.60'],
 ['润（元）', None, None, None, None, None, None],
 [None, None, '', None, '', '', ''],
 ['归属于上市公司股东的扣除', '', '', '', '', '', ''],
 [None,
  '76,068,232.38',
  '230,240,541.18',
  '221,605,277.06',
  '-65.67%',
  '555,185,325.81',
  '555,185,325.81'],
 ['非经常性损益的净利润（元）', None, None, None, None, None, None],
 [None, '', '', '', '', '', ''],
 ['经营活动产生的现金流量净',
  '231,779,655.70',
  '',
  '467,413,827.87',
  '',
  '-157,445,351.9',
  '-157,445,351.9'],
 [None, None, '467,413,827.87', None, '-50.41%', None, None],
 ['额（元）', None, None, None, None, '6', '6'],
 [None, None, '', None, '', None, None],
 ['基本每股收益（元/股）', '0.11', '0.21', '0.20', '-45.00%', '0.40', '0.40'],
 ['稀释每股收益（元/股）', '0.11', '0.21', '0.20', '-45.00%', '0.39', '0.39'],
 ['加权平均净资产收益率', '3.84%', '7.34%', '7.13%', '-3.29%', '15.53%', '15.53%'],
 ['', '2020年末', '2019年末', None, '本年末比上年', '2018年末', None]]


import matplotlib.pyplot as plt

# Presumably chosen so the Chinese labels render; the font must be installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# Annual operating revenue in units of 亿元 (1e8 yuan) for 2020, 2019 and
# 2018, i.e. 3,087,888,201.57 / 2,850,585,250.74 / 3,136,843,231.96 yuan.
# These must be numbers: strings such as '30.87,888,20157' are not plotted
# as numeric bar heights.
num_list = [30.8788820157, 28.5058525074, 31.3684323196]  # data

name_list = ['2020', '2019', '2018']
plt.bar(range(len(num_list)), num_list, color='brown', tick_label=name_list)
plt.ylabel('营业收入（单位：亿元)')
plt.title('贵州百灵营业收入走势图')
plt.show()

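# 结论：新天药业2020年营业收入前三个季度稳步上升。
# (Conclusion: Xintian Pharmaceutical's 2020 operating revenue rose steadily over the first three quarters.)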