第一步:把行业Excel变成csv

In [1]:
import pandas as pd
import openpyxl
import re

xlsx = '纺织与制造行业.xlsx'

df = pd.read_excel(xlsx)

exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']

links = [c.value for c in C]

link_1=links[1:-1]

link_2=''.join(link_1)

p= re.compile('"(.*?)","(.*?)"')
list_of_tuple=p.findall(link_2)

df2=pd.DataFrame({'link':[t[0]for t in list_of_tuple],
                  'f_name':[t[1]for t in list_of_tuple]})
df2.to_csv('纺织与制造行业.csv')

第二步:从总的行业csv文件里下载10个行业的最新链接

In [2]:
import re
import pandas as pd
import os


df=pd.read_csv('纺织与制造行业.csv')
p=re.compile('(?<=\d{4})(年报)|(年年报)')        
f_names=[p.sub(('年年度报'), f)for f in df.f_name]
df['f_name']=f_names;del p,f_names

def filter_links(words,df,include=True):
    Is=[]
    for word in words:
        if include:
            Is.append([word in f for f in df.f_name])
        else:
            Is.append([word not in f for f in df.f_name])
    index=[]
    for r in range(len(df)):
        flag=not include
        for c in range(len(words)):
            if include:
                flag = flag or Is[c][r]
            else:
                flag = flag and Is[c][r]
        index.append(flag)
    df2 = df[index]
    return(df2)

df_all = filter_links(['摘要','询问函','社会责任','审计','财务','风险','债券'],df,include=False)
df_orig = filter_links(['(','('], df_all,include=False)
df_updt = filter_links(['(','('], df_all,include=True)
df_updt = filter_links(['取消'],df_updt,include=False)
In [3]:
def sub_with_update(df_updt,df_orig):
    df_newest = df_orig.copy()
    index_orig = []
    index_updt = []
    for i,f in enumerate(df_orig.f_name):
        for j,fn in enumerate(df_updt.f_name):
            if f in fn:  
                index_orig.append(i)
                index_updt.append(j)        
    #return((index_orig,index_updt))
    for n in range(len(index_orig)):
        i = index_orig[n]
        j = index_updt[n]
        df_newest.iloc[i,-2] = df_updt.iloc[j,-2]
        #df_newest.iloc[i,-1] = df_updt.iloc[j,-1]
    return(df_newest)

df_newest = sub_with_update(df_updt,df_orig)
#index_orig,index_updt = sub_with_update(df_updt, df_orig)
            
df_newest.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]

counts = df_newest['公司简称'].value_counts()
In [4]:
ten_company = []
for cn in counts.index[:10]:
    ten_company.append(filter_links([cn], df_newest))
    
if not os.path.exists('10companies'):
    os.makedirs('10companies')
    
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)  
    
ten_csv = os.listdir('10companies')

第三步:提取下载链接 下载pdf

In [5]:
import re
import pandas as pd
In [ ]:
names = input().strip().split( )
names = [ './10companies/%s.csv' % name for name in names]
df_list = [pd.read_csv(name) for name in names]
In [1]:
import fitz
import os
os.chdir(r"C:\Users\dell\Desktop\wanxinqi2201408\final project\10companies")
filenames = os.listdir() #把代码所在文件夹的所有文件和文件名查找出来
finalfix = '年年度报告'

pdf_list = [f for f in filenames if f.endswith('年年度报告.pdf')]
years = [f[-13:-9] for f in pdf_list]
pdf_list
Out[1]:
['2016-04-22-002486.SZ-嘉麟杰:2015年年度报告.pdf',
 '2017-03-20-002486.SZ-嘉麟杰:2016年年度报告.pdf',
 '2018-04-17-002486.SZ-嘉麟杰:2017年年度报告.pdf',
 '2019-03-28-002486.SZ-嘉麟杰:2018年年度报告.pdf',
 '2020-04-29-002486.SZ-嘉麟杰:2019年年度报告.pdf',
 '2021-04-30-002486.SZ-嘉麟杰:2020年年度报告.pdf',
 '600107湖北美尔雅股份有限公司2020年年度报告.pdf',
 '600107美尔雅2016年年度报告.pdf',
 '600107美尔雅2017年年度报告.pdf',
 '600107美尔雅2019年年度报告.pdf',
 '600107: 美尔雅2015年年度报告.pdf',
 '600156华升股份2016年年度报告.pdf',
 '600156华升股份2018年年度报告.pdf',
 '600156华升股份2019年年度报告.pdf',
 '600156华升股份2020年年度报告.pdf',
 '600220江苏阳光2016年年度报告.pdf',
 '600220江苏阳光2019年年度报告.pdf',
 '600220江苏阳光股份有限公司2020年年度报告.pdf',
 '600689上海三毛2017年年度报告.pdf',
 '600689上海三毛2018年年度报告.pdf',
 '600689上海三毛2019年年度报告.pdf',
 '600689上海三毛企业(集团)股份有限公司2020年年度报告.pdf',
 '孚日股份:2016年年度报告.pdf',
 '孚日股份:2017年年度报告.pdf',
 '孚日股份:2018年年度报告.pdf',
 '孚日股份:2020年年度报告.pdf',
 '新野纺织:2015年年度报告.pdf',
 '新野纺织:2016年年度报告.pdf',
 '新野纺织:2017年年度报告.pdf',
 '新野纺织:2018年年度报告.pdf',
 '新野纺织:2019年年度报告.pdf',
 '新野纺织:2020年年度报告.pdf',
 '棒杰股份:2016年年度报告.pdf',
 '棒杰股份:2017年年度报告.pdf',
 '棒杰股份:2018年年度报告.pdf',
 '棒杰股份:2019年年度报告.pdf',
 '棒杰股份:2020年年度报告.pdf',
 '浔兴股份:2015年年度报告.pdf',
 '浔兴股份:2016年年度报告.pdf',
 '浔兴股份:2017年年度报告.pdf',
 '浔兴股份:2018年年度报告.pdf',
 '浔兴股份:2019年年度报告.pdf',
 '浔兴股份:2020年年度报告.pdf',
 '联发股份:2015年年度报告.pdf',
 '联发股份:2016年年度报告.pdf',
 '联发股份:2017年年度报告.pdf',
 '联发股份:2018年年度报告.pdf',
 '联发股份:2019年年度报告.pdf',
 '联发股份:2020年年度报告.pdf']
In [31]:
import fitz
import os
import re
import pandas as pd
In [2]:
def extract_data(pdf):
    idx = pdf.find(':')  
    company_name = pdf[0:idx]
    year = pdf[idx+1:idx+5]
    
    doc = fitz.open(pdf)  
    text = [page.get_text() for page in doc]  
    text = ''.join(text)  
    
    p_s = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')  
    section_match = p_s.search(text)  
    s_idx = section_match.start()  
    
    p = re.compile('营业收入(.*?)归属于',re.DOTALL) 
    data_line = p.search(text[s_idx:]).group() 
    data_line = data_line.replace('\n', '') 
    p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}') 
    revenue = p_digit.search(data_line).group() 
    revenue = revenue.replace(',','') 
    return((company_name,year,revenue))
In [32]:
companies, years, revenues = [],[],[]
for pdf in pdf_list:
    company, year, revenue = extract_data(pdf)
    companies.append(company)
    years.append(year)
    revenues.append(revenue)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-32-e3d51f77a08f> in <module>()
      1 companies, years, revenues = [],[],[]
----> 2 for pdf in pdf_list:
      3     company, year, revenue = extract_data(pdf)
      4     companies.append(company)
      5     years.append(year)

NameError: name 'pdf_list' is not defined
In [28]:
df = pd.DataFrame({'company': companies,
                  'year': years,
                  'revenue': revenues})
In [27]:
df.sort_values('company',ignore_index=True, inplace=True)
df
Out[27]:
company year revenue

company year revenue 0 2016-04-22-002486.SZ-嘉麟杰 2015 700825174.66 1 2017-03-20-002486.SZ-嘉麟杰 2016 727895014.30 2 2018-04-17-002486.SZ-嘉麟杰 2017 883285616.26 3 2019-03-28-002486.SZ-嘉麟杰 2018 879139974.80 4 2020-04-29-002486.SZ-嘉麟杰 2019 1005356349.08 5 2021-04-30-002486.SZ-嘉麟杰 2020 1171293263.40 6 600107 美尔雅2018年度报告 470953799.24 7 600107湖北美尔雅股份有限公司2020年年度报告.pd 6001 338918564.50 8 600107美尔雅2016年年度报告.pd 6001 434337009.00 9 600107美尔雅2017年年度报告.pd 6001 443660878.19 10 600107美尔雅2019年年度报告.pd 6001 447250852.08 11 600156华升股份2016年年度报告.pd 6001 593146428.82 12 600156华升股份2018年年度报告.pd 6001 1016859661.35 13 600156华升股份2019年年度报告.pd 6001 103824.45 14 600156华升股份2020年年度报告.pd 6001 91353.87 15 600220江苏阳光2016年年度报告.pd 6002 2092172271.84 16 600220江苏阳光2019年年度报告.pd 6002 2349026226.65 17 600220江苏阳光股份有限公司2020年年度报告.pd 6002 1972351946.25 18 600689上海三毛2017年年度报告.pd 6006 1277461194.16 19 600689上海三毛2018年年度报告.pd 6006 1378099486.15 20 600689上海三毛2019年年度报告.pd 6006 1369543327.65 21 600689上海三毛企业(集团)股份有限公司2020年年度报告.pd 6006 945417541.82 22 孚日股份 2020 4432082348.81 23 孚日股份 2018 5170527632.84 24 孚日股份 2016 4374976362.28 25 孚日股份 2017 4821675351.22 26 新野纺织 2015 3046708978.24 27 新野纺织 2016 4085467260.70 28 新野纺织 2017 5195202111.76 29 新野纺织 2018 6059889289.10 30 新野纺织 2019 5732043894.74 31 新野纺织 2020 4865937958.66 32 棒杰股份 2020 625631302.42 33 棒杰股份 34 棒杰股份 2018 415512906.25 35 棒杰股份 2017 434094913.54 36 棒杰股份 2016 389873815.96 37 浔兴股份 2015 1041456861.05 38 浔兴股份 2016 1175490158.10 39 浔兴股份 2017 1859901688.90 40 浔兴股份 2018 2272495886.73 41 浔兴股份 2019 1919149402.77 42 浔兴股份 2020 1589919387.95 43 联发股份 2018 4168693635.91 44 联发股份 2017 4003621033.77 45 联发股份 2019 3910527325.36 46 联发股份 2015 3476970839.03 47 联发股份 2016 3738610953.91 48 联发股份 2020 3873259434.39

绘图

In [12]:
import numpy as np 
from matplotlib import pyplot as plt 

嘉麟杰股份绘图

In [24]:
df_jlj = df[0:6]
df_jlj
Out[24]:
company year revenue
In [35]:
df2_jlj = df_jlj.sort_values('year',ignore_index=True)
y_jlj = df2_jlj['year']
y_jlj
r_jlj = df2_jlj['revenue']
r_jlj 
Out[35]:
Series([], Name: revenue, dtype: float64)

0 700825174.66 1 727895014.30 2 883285616.26 3 879139974.80 4 1005356349.08 5 1171293263.40 Name: revenue, dtype: object

In [36]:
x = [2015,2016,2017,2018,2019,2020]
y = [700825174.66 , 727895014.30,883285616.26, 879139974.80,1005356349.08,1171293263.40]
plt.title('revenue changing during 2016-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

美尔雅股份

In [37]:
df_mry = df[6:12]
df2_mry = df_mry.sort_values('year',ignore_index=True)
y_mry = df2_mry['year']
r_mry = df2_mry['revenue']
r_mry
Out[37]:
Series([], Name: revenue, dtype: float64)

0 470953799.24 1 338918564.50 2 434337009.00 3 443660878.19 4 447250852.08 5 593146428.82 Name: revenue, dtype: object

In [38]:
x = [2015,2016,2017,2018,2019,2020]
y = [470953799.24,338918564.50,434337009.00,443660878.19,447250852.08,593146428.82]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

华升股份

In [40]:
df_hs = df[13:18]
df2_hs = df_hs.sort_values('year',ignore_index=True)
y_hs = df2_hs['year']
r_hs = df2_hs['revenue']
r_hs
Out[40]:
Series([], Name: revenue, dtype: float64)

0 103824.45 1 91353.87 2 2092172271.84 3 2349026226.65 4 1972351946.25 Name: revenue, dtype: object

In [41]:
x = [2016,2017,2018,2019,2020]
y = [ 103824.45,91353.87,2092172271.84,2349026226.65,1972351946.25]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

江苏阳光股份有限公司

In [43]:
df_jsyg = df[18:23]
df2_jsyg = df_jsyg.sort_values('year',ignore_index=True)
y_jsyg = df2_jsyg['year']
r_jsyg = df2_jsyg['revenue']
r_jsyg
Out[43]:
Series([], Name: revenue, dtype: float64)

0 4432082348.81 1 1277461194.16 2 1378099486.15 3 1369543327.65 4 945417541.82 Name: revenue, dtype: object

In [44]:
x = [2015,2016,2017,2019,2020]
y = [4432082348.81,1277461194.16,1378099486.15,1369543327.65,945417541.82]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

上海三毛企业(集团)股份有限公司

In [ ]:
df_shsm = df[24:29]
df2_shsm = df_shsm.sort_values('year',ignore_index=True)
y_shsm = df2_shsm['year']
r_shsm = df2_shsm['revenue']
r_shsm

0 3046708978.24 1 4374976362.28 2 4085467260.70 3 4821675351.22 4 5195202111.76 Name: revenue, dtype: object

In [45]:
x = [2016,2017,2018,2019,2020]
y = [3046708978.24,4374976362.28,4085467260.70,4821675351.22,5195202111.76]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

孚日股份

In [46]:
df_fr = df[30:34]
df2_fr = df_fr.sort_values('year',ignore_index=True)
y_fr = df2_fr['year']
r_fr = df2_fr['revenue']
r_fr
Out[46]:
Series([], Name: revenue, dtype: float64)

0 5732043894.74 1 600896507.81 2 4865937958.66 3 625631302.42 Name: revenue, dtype: object

In [51]:
x = [2017,2018,2019,2020]
y = [5732043894.74,600896507.81,4865937958.66,625631302.42]
plt.title('revenue changing during 2017-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

新野纺织

In [60]:
df_xy = df[26:31]
df2_xy = df_xy.sort_values('year',ignore_index=True)
y_xy = df2_xy['year']
r_xy = df2_xy['revenue']
r_xy
Out[60]:
Series([], Name: revenue, dtype: float64)

0 1041456861.05 1 389873815.96 2 1175490158.10 3 434094913.54 4 1859901688.90 Name: revenue, dtype: object

In [54]:
x = [2015,2016,2017,2018,2019,2020]
y = [1041456861.05,389873815.96,1175490158.10,434094913.54,1859901688.90]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-54-a5edb07dac07> in <module>()
      4 plt.xlabel("year")
      5 plt.ylabel("revenue")
----> 6 plt.plot(x,y)
      7 plt.show()

~\Anaconda3\lib\site-packages\matplotlib\pyplot.py in plot(*args, **kwargs)
   3361                       mplDeprecation)
   3362     try:
-> 3363         ret = ax.plot(*args, **kwargs)
   3364     finally:
   3365         ax._hold = washold

~\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, *args, **kwargs)
   1865                         "the Matplotlib list!)" % (label_namer, func.__name__),
   1866                         RuntimeWarning, stacklevel=2)
-> 1867             return func(ax, *args, **kwargs)
   1868 
   1869         inner.__doc__ = _add_data_doc(inner.__doc__,

~\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in plot(self, *args, **kwargs)
   1526         kwargs = cbook.normalize_kwargs(kwargs, _alias_map)
   1527 
-> 1528         for line in self._get_lines(*args, **kwargs):
   1529             self.add_line(line)
   1530             lines.append(line)

~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _grab_next_args(self, *args, **kwargs)
    404                 this += args[0],
    405                 args = args[1:]
--> 406             for seg in self._plot_args(this, kwargs):
    407                 yield seg
    408 

~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _plot_args(self, tup, kwargs)
    381             x, y = index_of(tup[-1])
    382 
--> 383         x, y = self._xy_from_xy(x, y)
    384 
    385         if self.command == 'plot':

~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _xy_from_xy(self, x, y)
    240         if x.shape[0] != y.shape[0]:
    241             raise ValueError("x and y must have same first dimension, but "
--> 242                              "have shapes {} and {}".format(x.shape, y.shape))
    243         if x.ndim > 2 or y.ndim > 2:
    244             raise ValueError("x and y can be no greater than 2-D, but have "

ValueError: x and y must have same first dimension, but have shapes (6,) and (5,)

棒杰股份

In [ ]:
df_hs = df[:36]
df2_hs = df_hs.sort_values('year',ignore_index=True)
y_hs = df2_hs['year']
r_hs = df2_hs['revenue']
r_hs
In [55]:
x = [2015,2016,2017,2018,2019,2020]
y = [470953799.24,338918564.50,434337009.00,443660878.19,447250852.08,593146428.82]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

浔兴股份

In [ ]:
df_xx = df[37:42]
df2_xx = df_xx.sort_values('year',ignore_index=True)
y_xx = df2_xx['year']
r_xx = df2_xx['revenue']
r_xx
In [59]:
x = [2015,2016,2017,2018,2019,2020]
y = [1041456861.05,1175490158.10,1859901688.90,2272495886.73,1919149402.77,1589919387.95]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

联发股份

In [57]:
df_lf = df[43:48]
df2_lf = df_lf.sort_values('year',ignore_index=True)
y_lf = df2_lf['year']
r_lf = df2_lf['revenue']
r_lf
Out[57]:
Series([], Name: revenue, dtype: float64)
In [58]:
x = [2015,2016,2017,2018,2019,2020]
y = [3476970839.03,3738610953.91,4003621033.77,4168693635.91,3910527325.36,3873259434.39]
plt.title('revenue changing during 2015-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()