# 生成行业csv (generate the industry CSV)


import pandas as pd
import openpyxl
import re

# Source workbook for the industry.
xlsx = '饮料行业.xlsx'

df = pd.read_excel(xlsx)

# Open the same workbook with openpyxl to read the raw cell contents of
# column C on the active sheet.
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']

links = [cell.value for cell in C]

# Skip the first and the last cell of the column, then glue the remaining
# cell texts together so a single regex pass can scan all of them.
# NOTE(review): presumably the first cell is a header and the last one a
# footer - confirm against the workbook.
link_1 = links[1:-1]
link_2 = ''.join(link_1)

# Each match is a pair of double-quoted strings separated by a comma.
p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(link_2)

df2 = pd.DataFrame({
    'link': [first for first, _ in list_of_tuple],
    'f_name': [second for _, second in list_of_tuple],
})
df2.to_csv('饮料行业.csv')


# 选取十家公司 (select ten companies)


import re
import pandas as pd
import os


df = pd.read_csv('饮料行业.csv')

# Rewrite the short title forms "<4 digits>年报" and "年年报" to "年年度报".
# The alternation must be the ASCII "|": a full-width "｜" is an ordinary
# literal character, so a pattern written with it only matches titles that
# contain that exact character and the substitution never fires.
# The raw string keeps "\d" from being treated as a string escape.
p = re.compile(r'(?<=\d{4})(年报)|(年年报)')
f_names = [p.sub('年年度报', f) for f in df.f_name]
df['f_name'] = f_names
del p, f_names

def filter_links(words,df,include=True):
    """Select the rows of ``df`` by substring tests on the ``f_name`` column.

    words   -- substrings to look for in each file name
    df      -- DataFrame with an ``f_name`` column of strings
    include -- when true, keep rows whose name contains at least one of
               ``words``; otherwise keep rows whose name contains none

    Returns ``df`` indexed by the resulting list of booleans.
    """
    if include:
        keep = [any(word in name for word in words) for name in df.f_name]
    else:
        keep = [all(word not in name for word in words) for name in df.f_name]
    return df[keep]

# Drop everything that is not a full annual report.
# '问询函' is listed next to '询问函' because either character order would
# otherwise let an inquiry-letter title slip through.
# NOTE(review): confirm which spelling actually occurs in the data.
df_all = filter_links(
    ['摘要', '询问函', '问询函', '社会责任', '审计', '财务', '风险', '债券'],
    df, include=False)

# A parenthesised suffix marks a revised filing. Both the ASCII '(' and the
# full-width '（' are tested; listing the ASCII one twice would leave
# full-width titles classified as originals.
df_orig = filter_links(['(', '（'], df_all, include=False)
df_updt = filter_links(['(', '（'], df_all, include=True)
# Titles containing '取消' are removed from the update set.
df_updt = filter_links(['取消'], df_updt, include=False)

def sub_with_update(df_updt,df_orig):
    """Return a copy of ``df_orig`` with links replaced by updated ones.

    An update row applies to an original row when the original ``f_name`` is
    a substring of the update's ``f_name``. For every such pair the value in
    the second-to-last column of the original row is overwritten with the
    value from the second-to-last column of the update row, so the original
    file name is kept and only the link changes. When several updates match
    one original, the one that comes last in ``df_updt`` wins.

    df_updt -- DataFrame of updated filings (needs an ``f_name`` column)
    df_orig -- DataFrame of original filings with the same column layout

    Neither input is modified.
    NOTE(review): assumes the link is the second-to-last column in both
    frames - confirm against the callers.
    """
    df_newest = df_orig.copy()
    for i, f in enumerate(df_orig.f_name):
        for j, fn in enumerate(df_updt.f_name):
            if f in fn:
                # Copy link -> link. Reading column -1 here would put the
                # update's file name into the link column.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest

# Original filings with the updated filings folded in, ordered by file name.
df_newest = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=['f_name'], inplace=True, ignore_index=True)

# The first four characters of every file name serve as the company label.
df_newest['公司简称'] = [name[:4] for name in df_newest.f_name]

# value_counts() lists labels by descending frequency, so the first ten
# index entries are the labels with the most rows.
counts = df_newest['公司简称'].value_counts()

ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]

os.makedirs('10companies', exist_ok=True)

# One CSV per selected label, named after the label of the frame's first row.
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)

ten_csv = os.listdir('10companies')

df_all.sort_values(by=['f_name'], inplace=True, ignore_index=True)


# 获取链接 (fetch the download links)


import pandas as pd
import re
import requests
import os 
import time

# Load the industry link table and expose its two columns as Series.
df = pd.read_csv('饮料行业.csv')
links = df['link']
f_names = df['f_name']


def get_PDF_url(url):
    """Fetch a page and extract the download link from its first anchor.

    The page at ``url`` is downloaded and decoded as UTF-8. The first
    ``<a href=...>...</a>`` element supplies the relative link and the
    display name.

    url -- address of the page to scan

    Returns ``(href, fname)``: the link prefixed with the first 26
    characters of the response URL, and the anchor text with surrounding
    whitespace stripped. Returns an empty tuple, after emitting a warning,
    when the page contains no matching anchor.
    """
    import warnings  # local import: only the failure path needs it

    r = requests.get(url)
    try:
        r.encoding = 'utf-8'
        html = r.text
    finally:
        r.close()  # the HTML is in memory; release the connection

    # Raw string so "\s" reaches the regex engine instead of being parsed as
    # an (invalid) string escape.
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)  # the first <a> is taken to be the target tag
    if a is None:
        # warnings.warn actually reports the problem; constructing a bare
        # Warning(...) object has no effect.
        warnings.warn('没有找到下载链接，请手动下载：%s' % url)
        return ()

    href = a.group(1)
    fname = a.group(2).strip()
    # NOTE(review): the 26-character prefix equals 'http://news.windin.com/ns/'
    # for URLs of that form; assumes every page lives under such a prefix.
    href = r.url[:26] + href
    return (href, fname)

# Step 1: resolve every page link to its PDF link and display name.
hrefs = []
fnames = []
for link in links:
    result = get_PDF_url(link)
    # get_PDF_url returns an empty tuple when it finds no anchor; skip such
    # pages instead of failing on the two-name unpacking.
    if result:
        href, fname = result
        hrefs.append(href)
        fnames.append(fname)
    time.sleep(10)  # pause between requests
df_final_links = pd.DataFrame({'href': hrefs,
                               'f_name': fnames})
df_final_links.to_csv('final_links_饮料行业.csv')


# Step 2: download every PDF under its own name.
df_final_links = pd.read_csv('final_links_饮料行业.csv')
hrefs = df_final_links['href']
# Pair each link with the name from the same row; reusing the loop variable
# left over from step 1 would write every download to one file.
for href, fname in zip(hrefs, df_final_links['f_name']):
    r = requests.get(href, allow_redirects=True)
    try:
        with open('%s' % fname, 'wb') as pdf_file:
            pdf_file.write(r.content)
    finally:
        r.close()  # close each response, not only the last one
    time.sleep(10)

# Sample page address.
url = 'http://news.windin.com/ns/bulletin.php?code=8bbfedc5d806&id=1360724&type=1'


# 下载PDF (download the PDFs)


import pandas as pd
import requests
import time

df_final_links = pd.read_csv('final_links_饮料行业.csv')
hrefs = df_final_links['href']
f_names = df_final_links['f_name']

# Walk links and names in lockstep so each PDF is saved under the name from
# its own row. An expression statement such as `f_names[i]` binds nothing,
# which leaves the target file name undefined.
for href, fname in zip(hrefs, f_names):
    r = requests.get(href, allow_redirects=True)
    try:
        with open('%s' % fname, 'wb') as pdf_file:
            pdf_file.write(r.content)
    finally:
        r.close()  # close every response; also safe when there are no rows
    time.sleep(10)  # pause between downloads


# 承德露露 (000848)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 000848.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [18, 19, 21, 26, 27, 27, 25, 21, 21, 22]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('000848')

plt.show()


# 古越龙山 (600059)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 600059.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [1.2, 1.7, 1.9, 1.4, 1.8, 1.3, 1.2, 1.6, 1.7, 2.1]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('600059')

plt.show()


# 海南椰岛 (600238)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 600238.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [1.9, 0.4, 1.5, 1.3, 0.4, 0.1, -0.3, -1, 0.4, -2.6]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('600238')

plt.show()


# 惠泉啤酒 (600573)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 600573.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [0.5, 0.3, -0.7, 0.2, 0.3, 0.2, 0.01, 0.2, 0.2, 0.2]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('600573')

plt.show()


# 金枫酒业 (600616)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 600616.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [1.3, 1.4, 1.0, 1.1, 0.7, 0.6, 0.5, -0.6, 0.3, 0.1]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('600616')

plt.show()


# 莫高股份 (600543)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 600543.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [0.4, 0.5, 0.5, -0.8, 0.1, 0.2, 0.25, 0.27, 0.27, 0.01]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('600543')

plt.show()


# 青岛啤酒 (600600)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 600600.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [15.2, 17.4, 17.6, 19.7, 19.9, 17.1, 10.4, 12.6, 14.2, 22.1]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('600600')

plt.show()


# 维维股份 (600300)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 600300.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [47.6, 53.7, 58.1, 50.6, 44.6, 38.8, 44.6, 46.5, 50.3, 50.4]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('600300')

plt.show()


# 张裕A (000869)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 000869.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [49.8, 60.2, 56.4, 43.2, 41.5, 46.5, 47.1, 49.3, 50.7, 33.9]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('000869')

plt.show()


# 珠江啤酒 (002461)


import numpy as np 
from matplotlib import pyplot as plt 

# Line chart of the hand-entered yearly figures for stock code 002461.
# np.arange excludes the stop value, so x runs from 2010 to 2019 and lines
# up with the ten entries of y.
x = np.arange(2010, 2020)
y = [30.5, 35.6, 34.7, 33.5, 37.6, 40.3, 42.4, 40.4, 42.4, 42.5]

plt.plot(x, y)

plt.xlabel('year')
plt.ylabel('total operating income')  # label spelling corrected
plt.title('002461')

plt.show()